NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T))
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
# remove -COMP from genomicsSampleID
mutate_at("genomicsSampleID", str_replace, "-COMP", "")
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
left_join(NEON_metagenomes, by = "Sample Name") %>%
left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
rename("label" = "Bin ID")
tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")
#Gammaproteobacteria
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria <- NEON_MAGs_metagenomes_chemistry %>%
filter(str_detect(`Class`,"Gammaproteobacteria"))
#Steroidobacterales
NEON_MAGs_metagenomes_chemistry_Steroidobacterales<- NEON_MAGs_metagenomes_chemistry %>%
filter(str_detect(`Order`,"Steroidobacterales"))
#Burkholderiales
NEON_MAGs_metagenomes_chemistry_Burkholderiales <- NEON_MAGs_metagenomes_chemistry %>%
filter(str_detect(`Order`,"Burkholderiales"))
#Novel
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria_Novel <- NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
filter(is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species))
#almost are novel only two have species names
#Toolik
NEON_MAGs_metagenomes_chemistry_TOOL<- NEON_MAGs_metagenomes_chemistry %>%
filter(`Site ID.x` == "TOOL") %>%
mutate(`Genome Size (Kbp)`=as.integer(`Total Number of Bases`/1000))
NEON_MAGs_metagenomes_chemistry_TOOL_Novel <- NEON_MAGs_metagenomes_chemistry_TOOL %>%
filter(is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species))
#almost are novel only two have species names
TOOL_MAGs_label <- NEON_MAGs_metagenomes_chemistry_TOOL$label
tree_bac_TOOL_MAGs <-drop.tip(tree_bac,tree_bac$tip.label[-match(TOOL_MAGs_label, tree_bac$tip.label)])
# Make a vector with the internal node lables
node_vector_bac_TOOL_MAGS = c(tree_bac_TOOL_MAGs$tip.label,tree_bac_TOOL_MAGs$node.label)
NEON_MAGs_metagenomes_chemistry_Gamma_noblank <- NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
rename("Orders" = "Order") %>%
rename("Phyla" = "Phylum") %>%
rename("AssemblyType" = "Assembly Type") %>%
rename("WaterpH" ="soilInWaterpH") %>%
rename("Temp" ="soilTemp") %>%
rename("BinCompleteness" = "Bin Completeness") %>%
rename("BinContamination" = "Bin Contamination") %>%
rename("TotalNumberofBases" = "Total Number of Bases") %>%
rename("EcosystemSubtype" = "Ecosystem Subtype") %>%
rename("GeneCount" = "Gene Count") %>%
rename("GCassembled" = "GC * assembled")
NEON_MAGs_metagenomes_chemistry_TOOL_noblank <- NEON_MAGs_metagenomes_chemistry_TOOL %>%
rename("Phyla" = "Phylum") %>%
rename("AssemblyType" = "Assembly Type") %>%
rename("WaterpH" ="soilInWaterpH") %>%
rename("Temp" ="soilTemp") %>%
rename("BinCompleteness" = "Bin Completeness") %>%
rename("BinContamination" = "Bin Contamination") %>%
rename("TotalNumberofBases" = "Total Number of Bases") %>%
rename("EcosystemSubtype" = "Ecosystem Subtype") %>%
rename("GeneCount" = "Gene Count") %>%
rename("GCassembled" = "GC * assembled")
NEON_MAGs_bact <- NEON_MAGs %>%
filter(Domain=="Bacteria")
NEON_MAGs_bact_ind <- NEON_MAGs %>%
filter(Domain=="Bacteria") %>%
filter(`Assembly Type`=="Individual")
NEON_MAGs_bact_co <- NEON_MAGs %>%
filter(Domain=="Bacteria") %>%
filter(`Assembly Type`=="Combined")
NEON_MAGs_bact_ind_Novel <- NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) )
NEON_MAGs_bact_co_Novel <- NEON_MAGs_bact_co %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) )
NEON_MAGs_metagenomes_chemistry_TOOL_Xanthomonadales <- NEON_MAGs_metagenomes_chemistry_TOOL %>%
filter(str_detect(`Order`,"Xanthomonadales"))
NEON_MAGs_metagenomes_chemistry_TOOL_Burkholderiales <- NEON_MAGs_metagenomes_chemistry_TOOL %>%
filter(str_detect(`Order`,"Burkholderiales"))
NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales <- NEON_MAGs_metagenomes_chemistry_TOOL %>%
filter(str_detect(`Order`,"Steroidobacterales"))
NEON_MAGs_metagenomes_chemistry_Alaska <-NEON_MAGs_metagenomes_chemistry %>%
filter(str_detect(Site.x,"Alaska"))
# Make a vector with the internal node labels
node_vector_bac = c(tree_bac$tip.label,tree_bac$node.label)
# Search for your Phylum, dont sort differently it will mess up nodes
#NEON
phylumss <-NEON_MAGs_metagenomes_chemistry %>%
count(Phylum, sort=TRUE)
n=1
while (n!=29) {
if (length(grep(phylumss[n,1], node_vector_bac, value = TRUE))==2) {
phylumss[n,3] <-match(grep(phylumss[n,1], node_vector_bac, value = TRUE), node_vector_bac)[2]
}
else {
phylumss[n,3] <-match(grep(phylumss[n,1], node_vector_bac, value = TRUE), node_vector_bac)[1]
}
n=n+1
}
# for some reason they didnt name phylum subpopulations the same way for each, so we have to correct for Desulfobacterota
# grep("Desulfobacterota", node_vector_bac, value = TRUE)
# match(grep("Desulfobacterota", node_vector_bac, value = TRUE), node_vector_bac)
# match(grep("Desulfobacterota_B", node_vector_bac, value = TRUE), node_vector_bac)
phylumss[16,3] <- match(grep(phylumss[16,1], node_vector_bac, value = TRUE), node_vector_bac)[1]
phylumss <-phylumss %>%
arrange(desc(`...3`))
colortest <-viridis(29)
n=1
while (n!=29) {
phylumss[n,4] <- colortest[n]
n=n+1
}
#Gamma
tree_bac_node_Gammaproteobacteria <- Preorder(tree_bac)
tree_Gammaproteobacteria <- Subtree(tree_bac_node_Gammaproteobacteria, 3048)
# grep("Thermoproteota", node_vector_bac, value = TRUE)
# match(grep("Thermoproteota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Actinomycetota", node_vector_bac, value = TRUE)
# match(grep("Actinomycetota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Desulfobacterota", node_vector_bac, value = TRUE)
# match(grep("Desulfobacterota", node_vector_bac, value = TRUE), node_vector_bac)
# match(grep("Desulfobacterota_B", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Bacteroidota", node_vector_bac, value = TRUE)
# match(grep("Bacteroidota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Verrucomicrobiota", node_vector_bac, value = TRUE)
# match(grep("Verrucomicrobiota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Chloroflexota", node_vector_bac, value = TRUE)
# match(grep("Chloroflexota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Eremiobacterota", node_vector_bac, value = TRUE)
# match(grep("Eremiobacterota", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Patescibacteria", node_vector_bac, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac, value = TRUE), node_vector_bac)
#
# grep("Pseudomonadota", node_vector_bac, value = TRUE)
# match(grep("Pseudomonadota", node_vector_bac, value = TRUE), node_vector_bac)
# grep("Phycisphaerae", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Phycisphaerae", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Acidobacteriota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Acidobacteriota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Actinomycetota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Actinomycetota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Myxococcota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Myxococcota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Bacteroidota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Bacteroidota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Verrucomicrobiota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Verrucomicrobiota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Chloroflexota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Chloroflexota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Eremiobacterota", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Eremiobacterota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
#
# grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Patescibacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
# match(grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
# theme_classic(axis.text.x = element_text(color="grey20", size = 12,angle = 90, hjust = 0.5, vjust = 0.5),
# axis.text.y = element_text(color = "grey20", size = 12), text=element_text(size = 16))
John_theme <-theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1, color="black", face='italic'),axis.title.x=element_text(size = 15),
axis.text.y=element_text(color="black"),axis.title.y=element_text(size = 15))
As global temperatures rise due to industrial pollution; it is important to monitor the ecosystems humanity depends on for industrial and agriculture production to identify beneficial agents like nitrogen fixating bacteria, as well as harmful one like phytopathogens. The soil bacterium MAGs collected and annotated in GOLD Study ID Gs0161344 represent a poll of soil bacterium populations across the United States. This data can be used to track bacterial population changes across ecosystem subtypes and help identify bacteria with climate change resistant properties. The MAGs collected in this study are a treasure trove of genomic data and may help identify climate change resistant and nitrogen fixating bacteria.
Carbon emissions from industrial activity has led to numerous changes to the global climate that threaten the ecosystems humanity depends on for industrial agriculture. Rising temperatures has caused the melting of glaciers and permafrost releasing bacterial species that have been dormant for millennium (Wu et al. 2022). Additionally, The higher green house gas atmospheric content has lead to the acidification of the soil (Ferdush et al. 2023) The changing climate has also lead to species migration. With the recent passing of the 1.5 Celsius average global temperature milestone set in Paris Agreement of 2015, it is imperative that society adapt to our changing world (McCulloch et al. 2024).
In the face of climate and antibiotic challenges, plants have developed symbiotic relationships with bacteria. For example, members of the Burkholderia family have been shown to help fix nitrogen in the rhizosphere of corn plants, promoting their growth. Thus, it has been proposed that humanity’s crops could be better insulated to ecological changes by employing such bacteria in our fertilizer (Su et al. 2024). While several beneficial bacterial species have identified, the vast majority of the bacterial kingdom remains sequenced. Additionally, with their ability to rapidly evolve in the face of ecological challenges, new species with more robust tolerances to climate change influences will likely only grow with time. Thus, soil bacterium represent a vast untapped resource of climate change resistant proteins and nitrogen fixators. Data collection effort by organizations like the National Ecological Observatory Network, provide a valuable genomic resource for phylogenetic analyses to determine the identities of potential beneficial bacteria as well as monitoring the population changes caused by a changing climate.
This study’s genomic data set was collected from soil samples by the National Ecological Observatory Network (NEON) from locations across the United States in GOLD Study ID Gs0161344. There were 1710 total bacterial MAGs with 22% being individual or combined assemblies of novel species of bacteria. To make analyses more feasible, this report will only comment on two data subsets, MAGs belonging to the class Gammaproteobacteria, and MAGs belonging found at Toolik Field Station, Alaska USA.
The class Gammaproteobacteria, under the phylum Pseudomondata, is made up of around 381 genera that thrive in marine, terrestial, and eukaryotic host ecosystems (Liao et al. 2020). Historically, this class has be defined phylogenetically by 16s rRNA sequence homology (Williams and Kelly 2013). Some notable members of this class include Escherichia coli, Vibrio fischeri, Pseudomonas aeruginosa, Pseudomonas putida , and Xanthomonas axonopodis pv. citri (Webster, Bogema, and Chapman 2020). This class has great diversity of morphologies with rod, cocci, spirilla, and filaments all represented (Williams et al. 2010). Additionally, species in class display a variety of trophisms including chemoautotrophs and photoautotrophs (Gao, Mohan, and Gupta 2009).
Located 400 miles north from Fairbanks, Alaska at the foot of the Brooks mountain range, biodiversity at Toolik Field Station is heavily influenced by its harsh winters where temperatures can reach -50⁰F. It is home to a variety of fauna including caribou, loons, voles, and polar bears. Located above the northern tree line, the vegetation in the tundra here mainly consists of birch, willow, sedges and grass. The site contains a large range of soil conditions, including layers of permafrost, created by glacial action (NEON 2023).
This study examines the genomic content and environmental conditions of bacteria found at the Toolik Field station to help establish a reference population for future comparisons of bacterial population changes.
Microbial samples analyzed in this study were collected as part of GOLD Study ID Gs0161344 from soil samples taken from NEON observation sites across the United States and sequenced via high throughput Illumina sequencing (Mukherjee et al. 2022). Sequence results were then processed and annotated by the DOE JGI Metagenome Workflow for its inclusion in the Integrated Microbial Genomes and Microbiomes (IGM/M) Database and Joint Genomic Institute ’s Genomes Online Database (JGI GOLD) (Clum et al. n.d.). Briefly this workflow consists of the following steps: (1) Assembly of contigs and read alignment to assembled contigs. Contigs are additionally processed for quality control. (2) Feature prediction of coding and non-coding genes, as well as CRISPR sequences. (3) Functional annotation, in which predicted features are assigned identifiers based on sequence similarity. (4) Taxonomic annotation in which contig-level phylogenetic assignments are made based on functional annotations. (5) Binning by high- and medium-quality genome bins. Bins are additionally screened for contamination. A detailed explanation of the workflow can be found in Clum et al., ASM mSystems, 2021.
The figures of this study were formatted with the following packages in R: tidyverse,knitr, ggtree, TDbook, ggimage, rphylopic, treeio, tidytree, ape, TreeTools, phytools, ggnewscale, ggtreeExtra, ggstar, DT
plottttttt <-ggtree(tree_bac, layout="circular", branch.length="none")
n=1
while (n!=29) {
if (is.na(phylumss[n,3])) {
} else {
plottttttt <-plottttttt + geom_cladelab(node=as.integer(phylumss[n,3]), label=as.character(phylumss[n,1]),size=10, align=TRUE, angle='auto', offset.text=1, textcolor=phylumss[n,4] ,barsize=1.5, fontsize=5, barcolor=as.character(phylumss[n,4]))+geom_hilight(node=as.integer(phylumss[n,3]), fill=as.character(phylumss[n,4], alpha=.6))
}
n=n+1
}
plottttttt
Figure 3: Phylogenetic Tree of all bacterial
MAGs. Tree constructed from samples collected in GOLD Study ID
Gs0161344 by the National Ecological Observatory Network.
### Figure 4:
knitr::include_url("data/lab14/sankey-NEON_MAG_ind_pavian.txt.html")
Figure 4: Sankey plot of all NEON Individual assembly MAGs.
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Phylum)), fill=`Site ID`))+geom_bar(position="dodge")+coord_flip()+labs(x="Phylum", y="NEON MAGs (n)")+theme_classic()+theme(axis.text.x=element_text(color="black"),axis.title.x=element_text(size = 15), axis.text.y=element_text(color="black",face = 'italic'),axis.title.y=element_text(size = 15), legend.title=element_text(size=15), legend.text=element_text(size=10))+ scale_y_continuous(limits=c(0,100),breaks = c(0,10,20,30,40,50,60,70,80,90,100))
Figure 5: NEON MAG distribution by
phylum. NEON site distribution of MAGs collected in GOLD Study
ID Gs0161344 organized by phylum.
The three phyla with the lionshare of bacteria found in this study were Actinomycetota, Pseusdomonadota, and Acidobacteriota (Fig. 3,4,5). National Grasslands LBJ, Texas USA (CLBJ) accounted for the greatest portion of bacteria(Fig. 5). A majority of phyla found in this study contained MAGs collected in at least two locations but quite a few of phyla with few MAGs were found in a single location. For example, MAGs belonging to the Desulfobacterota, Myxococcota_A, Eisenbacteria, Krumholzibacteriota, and Nitrospirota phyla were only found in Chase Lake Wetlands, North Dakota, USA (Fig. 5).
NEON_MAGs_bact_ind_Novel %>%
ggplot(aes(x=fct_rev(fct_infreq(`Site ID`)), fill=`Site ID`))+geom_bar(show.legend=FALSE)+coord_flip()+labs(x="Site ID", y="Total Novel Bacteria (n)")+theme_classic()+theme(axis.text.x=element_text(color="black"),axis.title.x=element_text(size = 15), axis.text.y=element_text(color="black"))
Figure 6: Novel Bacteria MAG NEON Site
Distribution. Novel Bacteria were determined from MAGs
constructed from individual assemblies. Novel indicates the MAGs could
not be placed in an existing group at the species, genus or family
level.
### Figure 7:
NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) ) %>%
ggplot(aes(x=fct_rev(`Phylum`), fill=`Phylum`))+geom_bar(show.legend=FALSE)+coord_flip()+labs(x="Phylum", y="Novel Species MAGs (n)")+theme_classic()+theme(axis.text.x=element_text(color="black"),axis.title.x=element_text(size = 15), axis.text.y=element_text(color="black",face = 'italic'))+scale_y_continuous(limits = c(0,150), breaks = c(0,10,20,30,40,50,60,70,80,90,100,110,120,130,140,150))
Figure 7: Novel bacteria predominately found in
the phylum Actinomycetota. Novel
bacteria were determined from MAGs constructed from individual
assemblies in GOLD Study ID Gs0161344. Novel indicates the MAGs could
not be placed in an existing group at the species, genus or family
level.
There were 243 and 129 novel species of bacteria annotated in GOLD Study ID Gs0161344’s individual and combined assembly MAGs, respectively. The majority of the novel individual assembly MAGs were found in the phylum Actinomycetota (Fig. 7). Novel individual assembly MAGs were spread out over all NEON collection sites with the top three sites for novel individual assemblies being National Grasslands LBJ, Texas USA (CLBJ), Great Basin, Onaqui, Utah USA (ONAQ), Konza Prairie Bio Station, Kansas USA (KONZ) (Fig.6)
NEON_MAGs_metagenomes_chemistry_Bacteria <-NEON_MAGs_metagenomes_chemistry %>%
filter(Domain=="Bacteria") %>%
mutate(`Genome Size (Kbp)`=as.integer(`Total Number of Bases`/1000))
NEON_MAGs_metagenomes_chemistry_Bacteria %>%
ggplot(aes(x=`Phylum`,y=`Genome Size (Kbp)`,color=`Phylum`))+geom_boxplot(show.legend = FALSE)+scale_y_continuous(limits=c(0,15000), breaks=c(0,2500,5000,7500,10000,12500,15000))+theme_classic()+labs(title="A", x="Phylum", y="Genome Size (Kbp)")+John_theme+theme(title=element_text(size=20))+
ggplot(data=NEON_MAGs_metagenomes_chemistry_Bacteria, aes(x=`Gene Count`, y=`Genome Size (Kbp)`, color=`Phylum`))+geom_point()+labs(title="B", x="Gene Count (n)", y="")+scale_y_continuous(limits=c(0,15000), breaks=c(0,2500,5000,7500,10000,12500,15000))+scale_x_continuous(limits=c(0,15000), breaks=c(0,2500,5000,7500,10000,12500,15000))+theme_classic()+theme_classic()+theme(axis.text.x=element_text(color="black"),axis.title.x=element_text(size = 15), axis.text.y=element_text(color="black"), legend.title=element_text(size=15),legend.text=element_text(size=10),title=element_text(size=20))
Figure 8: Overall linear trend in gene count vs
genome size for MAGs found across NEON sites. A) Boxplot of MAG
genome size (kbp) for all bacterial phyla. B) Scatter plot of MAG Gene
count vs genome size (kbp) for all bacterial MAGs. MAGs are colored by
phylum. All samples found in GOLD Study ID Gs0161344.
Terrestrial bacteria are known to have large genomes encoding thousands genes. This is due in larger part to the diverse environment they are exposed to. Their larger genomes allow for the expression of multiple metabolic phenotypes that allow them to adapt to environmental challenges. NEON samples analyzed in this study had a broad spread of genome sizes with the minimum genome and maximum genomes sizes being 753 from the phylum Chloroflexota and 12,584 kbp from the phylum Actinomycetota, respectively (Fig. 8). There was a linear relationship between gene count and genome for all NEON samples, with a rough 1,000 bp per gene ratio (Fig. 8B).
ggtree(tree_bac, layout="circular", branch.length="none") +
geom_hilight(node=as.integer(phylumss[1,3]), fill=as.character(phylumss[1,4], alpha=.6)) +
geom_cladelab(node=as.integer(phylumss[1,3]), label=as.character(phylumss[1,1]),size=10, align=TRUE, angle='auto', offset.text=1, textcolor=phylumss[1,4] ,barsize=1.5, fontsize=5, barcolor=as.character(phylumss[1,4]))+
geom_hilight(node=3048, fill="steelblue", alpha=.6) +
geom_cladelab(node=3048, label="Gammaproteobacteria", align=TRUE, angle='auto', offset=1,
offset.text=0.5 , textcolor='steelblue', barcolor='steelblue',barsize=1.5, fontsize=5)
Figure 9: Phylogenetic tree of all bacterial
MAGs. Tree contails all bacterial samples collected in GOLD
Study ID Gs0161344 by the National Ecological Observatory Network with
the Gammaproteobacteria class of Pseudomondata
highlighted in blue.
knitr::include_url("data/lab14/sankey-NEON_MAG_ind_Gpro(2).html")
Figure 10: Sankey plot of individual assembly Gammmaproteobacteria MAGs All samples found in GOLD Study ID Gs0161344.
Figure 11: Phylogenetic Tree of
Gammaproteobacteria. Marker size is based on
total number of bases in MAG. Markers colored by ecosystem subtype.Tree
includes MAGs in GOLD Study ID Gs0161344 filtered to those annotated as
belonging to the class Gammaproteobacteria.
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
ggplot(aes(x=Family))+geom_bar(aes(fill=Family),position=position_dodge2(width=0.9, preserve="single"),show.legend=FALSE)+coord_flip()+facet_wrap(vars(Order), scales="free_y", ncol=4)+labs(x="Family", y="MAGs (n)")+theme_classic()+theme(axis.text.x=element_text(color="black"),axis.title.x=element_text(size = 15), axis.text.y=element_text(color="black",face = 'italic'), legend.title=element_text(size=15),legend.text=element_text(size=10),strip.text = element_text(size=15, face = "italic"))+scale_y_continuous(limits=c(0,75), breaks=c(0,15,30,45,60))
Figure 12: Distribution of
Gammaproteobacteria MAGs by order. Barplot of
Gammaproteobacteria families organized into panels by order.
Barplot includes MAG reads from all NEON sites in GOLD Study ID
Gs0161344 filtered to those annotated as belonging to the class
Gammaproteobacteria.
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
mutate(`Genome Size (Kbp)`=as.integer(`Total Number of Bases`/1000)) %>%
ggplot(aes(x=`Order`,y=`Genome Size (Kbp)`,color=`Order`))+geom_boxplot(show.legend = FALSE)+scale_y_continuous(limits=c(0,15000), breaks=c(2500,5000,7500,10000,12500,15000))+theme_classic()+labs(x="Order", y="Genome Size (Kbp)")+John_theme
Figure 13: Genome Size (kbp) of
Gammaproteobacteria Members between 2500-5000 Kbp with large
variation in Burkholderiales. Boxplots includes MAG
reads from all NEON sites filtered to those annotated as belonging to
the class Gammaproteobacteria.
NEON MAGs assigned to the Gammaproteobacteria class were
found in all ecosystem subtypes (Fig. 11). Unlike the larger NEON data
set, the distribution of genome size of Gammaproteobacteria
members was fairly narrow with members averaging between 2000 and 5000
kbp (Fig. 13). With the exception of
Burkholderiales,Steroidobacterales, and
Xanthomonadales, this was largely due to the fewer MAGs in each
order(Fig.12). The vast majority of Gammaproteobacteria
annotated in this study were novel species with only two bacteria
belonging to the Xanthomonadales order assigned as
Stenotrophomonas stenotrophomonas sp024519465.
Steroidobacterales had by far the most annontated members,
while Burkholderiales had the most family member groups (Fig.
10,12).
Figure 14: Ecological Conditions of
Gammaproteobacteria Samples. A) Scatterplot of sample soil
temperature vs Gammaproteobacteria families found in GOLD Study
ID Gs0161344. Points are colored by order group. B) Scatterplot of
sample soil pH in water vs Gammaproteobacteria families found
in GOLD Study ID Gs0161344. Points are colored by order group. C)
Scatterplot of sample National Land Cover Database Vegetation Type vs
Gammaproteobacteria families found in GOLD Study ID Gs0161344.
Points are colored by order group. D) Scatterplot of sample ecosystem
subtype vs Gammaproteobacteria families found in GOLD Study ID
Gs0161344. Points are colored by order group.
Members of Gammaproteobacteria were found in a variety of ecosystem conditions. Gammabacteria were found in temperatures spanning 2-28 Celsius (Fig. 14A). The soil pH of all Gammaproteobacteria samples was largely neutral to mildly acidic with lowest pH around 4 (Fig. 14B). Overall, the vegetation class of sedge and grassland herbaceous and contained the least amount of family groups in Gammproteobacteria (Fig. 14C). This is not too surprising for sedge herbaceous as this vegetation class is only found in one Alaskan NEON site (Sup. Fig. 1). However, grassland herbaceous vegetation is present in 5 out of the 13 sample sites (Sup. Fig 2). Interestingly, despite being the Gammaproteobacteria order with the most MAGs and existing in a variety of vegetation classes, soil pH and temperatures, no members of the Steroidobacterales order were found in tropical forest or desert ecosystems sampled in this study(Fig. 14). These ecosystem subtypes correspond to NEON sites in Puerto Rico and Arizona, USA, respectively (Sup.Fig 2).
NEON_MAGs_metagenomes_chemistry_Steroidobacterales <-NEON_MAGs_metagenomes_chemistry_Steroidobacterales %>%
mutate(`Genome Size (Kbp)`=as.integer(`Total Number of Bases`/1000))
#Ecosubtype
NEON_MAGs_metagenomes_chemistry_Steroidobacterales %>%
ggplot(aes(x=Genus, y=`Ecosystem Subtype`,color=Genus))+geom_point(show.legend=FALSE)+labs(title="A", x="", y="Ecosystem Subtype")+theme_classic()+John_theme+theme(title=element_text(size=20))+
#siteID
ggplot(data=NEON_MAGs_metagenomes_chemistry_Steroidobacterales,aes(x=Genus, `Site ID.x`,color=Genus))+geom_point(show.legend=FALSE)+labs(title="B", x="", y="Site ID")+theme_classic()+John_theme+theme(title=element_text(size=20))+
#genomesize
ggplot(data=NEON_MAGs_metagenomes_chemistry_Steroidobacterales, aes(x=`Genus`,y=`Genome Size (Kbp)`,color=Genus))+geom_boxplot(show.legend = FALSE)+scale_y_continuous(limits=c(0,15000), breaks=c(2500,5000,7500,10000,12500,15000))+theme_classic()+labs(title="C", x="Genus", y="Genome Size (kbp)")+John_theme+theme(title=element_text(size=20))+
#genus count
ggplot(data=NEON_MAGs_metagenomes_chemistry_Steroidobacterales,aes(x=Genus,fill=Genus))+geom_bar(show.legend=FALSE)+labs(title="D", x="Genus", y="MAGs (n)")+theme_classic()+scale_y_continuous(limits=c(0,50), breaks=c(0,10,20,30,40,50))+John_theme+theme(title=element_text(size=20))
Figure 15:
Steroidobacteraceae members are found
in a variet of ecosystems but are overwhelmimg represented by the
genus Bog-1198. A) Dotplot of
ecosystem subtype each genus of Steroidobacteraceae was found
in. B) Dotplot of the NEON sites each genus of
Steroidobacteraceae was found in. C) Boxplot of MAG genome
sizes (kbp) for Steroidobacteraceae genera. D) Barplot of
Steroidobacteraceae genera MAGs. All samples belong to the GOLD
Study ID Gs0161344.
### Figure 16:
Figure 16: Steroidobacteraceae
BOG-1198 is found predominately in the northern
USA A) Barchart of BOG-1198 MAGs found in NEON
ecosystem subtypes. B) Barchart of BOG-1198 MAGs found at NEON
sites. All samples belong to the GOLD Study ID Gs0161344.
Members in the order Steriobacterales were further examined to see if their high population is correlated with genome size or sample location. Only the Steroidobacteraceae family was found under the order. This family contain 7 genera with the genus BOG-1198 accounting for 30 of the 50 of Steriobacterales MAGs (figs. 15A). Once again the distribution of genome size appears correlated to the total MAG counts for each genera in the Steroidobacteraceae family, with the higher populations corresponding to larger deviation in genome size (Fig. 15C,D). Given that BOG-1198* accounted for a majority of Steriobacterales MAGs, the genus was further examined. Members of this genus were found in several ecosystem subtypes and sample sites located in the northern United States (Fig. 16). The majority individual Assemblies of Steroidobacteraceae BOG-1198 MAGs were found at one of the three Alaskan sample sites (Fig. 16B). Note that all combined assembly MAGs were given the shrubland ecosystem subtype. A third of all BOG-1198 MAGs were combined assemblies.
NEON_MAGs_metagenomes_chemistry_Burkholderiales <-NEON_MAGs_metagenomes_chemistry_Burkholderiales %>%
mutate(`Genome Size (Kbp)`=as.integer(`Total Number of Bases`/1000))
NEON_MAGs_metagenomes_chemistry_Burkholderiales %>%
ggplot(aes(x=Genus, y=`Ecosystem Subtype`,color=`Family`))+geom_point(show.legend=FALSE)+labs(title="A", x="", y="Ecosystem Subtype")+theme_classic()+John_theme+theme(title=element_text(size=20))+
ggplot(data=NEON_MAGs_metagenomes_chemistry_Burkholderiales,aes(x=Genus,y= `Site ID.x`,color=`Family`))+geom_point()+labs(title="B", x="", y="Site ID")+theme_classic()+John_theme+theme(title=element_text(size=20), legend.text = element_text(size=10, face='italic'),legend.title = element_text(size=15))+
ggplot(data=NEON_MAGs_metagenomes_chemistry_Burkholderiales, aes(x=`Genus`,y=`Genome Size (Kbp)`,color=`Family`))+geom_boxplot(show.legend = FALSE)+scale_y_continuous(limits=c(0,15000), breaks=c(2500,5000,7500,10000,12500,15000))+theme_classic()+labs(title="C", x="Genus", y="Genome Size (Kbp)")+John_theme+theme(title=element_text(size=20))+
ggplot(data=NEON_MAGs_metagenomes_chemistry_Burkholderiales,aes(x=Genus,fill=`Family`))+geom_bar()+labs(title="D", x="Genus", y="MAGs (n)")+theme_classic()+John_theme+theme(title=element_text(size=20),legend.text = element_text(size=10, face='italic'),legend.title = element_text(size=15))
Figure 17: Burkholderiales
members are found in a variety of ecosystems and have a broad
distribution of genome size. A) Scatter plot of sample
ecosystem subtype vs Burkholderiales genera. P B) Scatter plot
of NEON site ID vs Burkholderiales genera. C) Box plot of
Burkholderiales genera genome sizes (kbp) D) Bar plot of
Burkholderiales genera MAGs. All plots colored by family group.
All samples from GOLD Study ID Gs0161344.
### Figure 18:
NEON_MAGs_metagenomes_chemistry_Burkholderiaceae <- NEON_MAGs_metagenomes_chemistry_Burkholderiales %>%
filter(str_detect(Family,"Burkholderiaceae" ))
ggplot(data=NEON_MAGs_metagenomes_chemistry_Burkholderiaceae, aes(x=`Ecosystem Subtype`,fill=Genus))+geom_bar(show.legend=FALSE)+labs(title="A", x="Ecosystem Subtype", y="MAGs (n)")+theme_classic()+theme(axis.title=element_text(size = 15),title=element_text(size=20), axis.text.y=element_text(color="black"), axis.text.x=element_text(angle=45, vjust=1, hjust=1, color="black"))+
ggplot(data=NEON_MAGs_metagenomes_chemistry_Burkholderiaceae, aes(x=`Site ID.x`, fill=Genus))+geom_bar()+labs(title="B", x="Site ID", y="MAGs (n)")+theme_classic()+theme(axis.title=element_text(size = 15),axis.text.y=element_text(color="black"), axis.text.x=element_text(color="black", angle=45, vjust=1, hjust=1), title=element_text(size=20),legend.text = element_text(size=10, face='italic'),legend.title = element_text(size=15))
Figure 18: Genera of
Burkholderiaceae are found in a
variety of ecosystems and NEON locations. A) Barplot of
Burkholderiaceae genera MAGs found in NEON ecosystem
subtypes. B) Barplot of Burkholderiaceae
genera MAGs found at NEON sites. Site labeled NA is for combined
assembly MAGS. Bar colored by genus group. Bars colored as NA represent
novel genera in Burkholderiales. All samples from GOLD Study ID
Gs0161344.
Members of the order Burkholderiales were also examined further at the genus level to determine if their broader diversity corresponded to the variety of ecosystems they were found in. Indeed, members of this order were found across the United States in several different ecosystem subtypes(Fig. 17A,B). Individually assembled members of the Burkholderiaceae including the genera Caballeronia, Herbaspirillum, and Paraburkholderi, were found mainly temperate forests with some members also found in tundra and and Boreal forest/Taiga subtypes (Fig. 17, 18A). Interestingly, genera genome size distribution appears not to be correlated to the amount genera members. Despite only having 4 MAGs from the Niwot Ridge site in Colorado the Herbaspirillum genera contained a broad range of genome sizes (fig. 17C). The Caballeronia genera, containing 5 MAGs, had a much tighter distribution of genome size (Fig. 17C). Also of note is the appearance of the Trinicki genera in the Wind River Experimental Forest in Washington (Fig. 17,18).
knitr::include_url("data/lab14/sankey-NEON_MAG_Toolik.html")
Figure 19: Sankey plot of all MAGs from Toolik Field Station, Alaska USA. Samples collected in GOLD Study ID Gs0161344.
ggtree(tree_bac_TOOL_MAGs, layout="circular", branch.length="none") +
geom_hilight(node=258, fill="grey", alpha=.6) +
geom_cladelab(node=258, label="Pseudomonadota", align=TRUE, angle='auto',
offset.text=0.5 , textcolor='black', barcolor='grey',barsize=1.5, fontsize=5)+
geom_hilight(node=259, fill="steelblue", alpha=.6) +
geom_cladelab(node=259, label="Gammaproteobacteria", align=TRUE, angle='auto', offset=0.75,
offset.text=0.5 , textcolor='black', barcolor='steelblue',barsize=1.5, fontsize=5)
Figure 20: Phylogenetic tree of all MAGs
collected at Toolik Field Station, Alaska USA.
Gammaproteobacteria class of Pseudomondata highlighted
in blue. Samples collected in GOLD Study ID Gs0161344 by the National
Ecological Observatory Network at Toolik Field Station, Alaska USA.
Figure 21: MAGs found at Toolik Field Station
dominated by Actinomycetota
and Pseudomondota
and Acidobacteriota
members. Stacked barplot of bacterial Phlya found at
Toolik Field Station, Alaska USA in GOLD Study ID Gs0161344. Bars are
colored by Order groups.
The majority of MAGs collected at Toolik Field Station belonged to the Actinomycelota and Pseudomonadota phyla (Fig. 20,21). Inside Actinomycelota, MAGs belonging to the order Thermoleophilia dominated (Fig. 21). The phylum* Pseudomonadota* was almost equally divided into Alpha and Gamma Proteobacteria (Fig. 21). The phylum Acidobacteriota was also frequently found at Toolik, with the vast majority of its MAGs belonging to the order Terriglobia (Fig. 21).
Figure 22: Toolik Field Station has some of the
lowest sample temperatures but average sample pHs compared to other NEON
sites. A) Scatterplot of sample soil temperatures at each NEON
site. B) Scatterplot of sample soil pHs in water at each NEON site.
Points colored by Site ID. All samples from GOLD Study ID Gs0161344.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
ggplot(aes(x=`Phylum`,y=`soilTemp`, color=`Phylum`), size=1)+geom_point(show.legend=FALSE)+facet_wrap(vars(Site.x), scales="free_y", ncol=3)+scale_y_continuous(limits=c(0,10)) +labs(y="Soil Temperature (Celsius)", x="Phylum")+theme_classic()+John_theme+theme(strip.text = element_text(size=13))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
Figure 23: Similar soil temperature and phyla
found at Alaskan NEON Sites Scatter plot of sample temperature
in Celsius vs phyla at Alaskan NEON sites in GOLD Study ID Gs0161344.
Left Panel: Caribou Creek Watershed, Alaska USA. Middle Panel: Healy,
Denali National Park, Alaska USA. Right Panel: Toolik Field Station,
Alaska USA. Points are colored by Phylum.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
ggplot(aes(x=`Phylum`,y=`soilInWaterpH`, color=`Phylum`), size=1)+geom_point(show.legend = FALSE)+facet_wrap(vars(Site.x), scales="free_y", ncol=3)+scale_y_continuous(limits=c(0,14),breaks =rep(0:14)) +labs(y="Soil pH", x="Phylum")+theme_classic()+John_theme+theme(strip.text = element_text(size=13))
Figure 24: Similar soil pH and phyla found at
Alaskan NEON Sites Scatter plot of sample soil pH in water vs
phyla at Alaskan NEON sites in GOLD Study ID Gs0161344. Left Panel:
Caribou Creek Watershed, Alaska USA. Middle Panel: Healy, Denali
National Park, Alaska USA. Right Panel: Toolik Field Station, Alaska
USA. Points are colored by Phylum.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
ggplot(aes(x=`Phylum`,y=`nlcdClass`, color=`Phylum`), size=1)+geom_point(show.legend = FALSE)+facet_wrap(vars(Site.x), scales="free_y", ncol=3)+labs(y="Vegetation Class", x="Phylum")+theme_classic()+John_theme+theme(strip.text = element_text(size=10))
Figure 25: Vegetation Class differs but similar
phyla found at Alaskan NEON Sites Scatter plot of National Land
Class Vegetation Class vs Phyla at Alaskan NEON Sites in GOLD Study ID
Gs0161344. Left Panel: Caribou Creek Watershed, Alaska USA. Middle
Panel: Healy, Denali National Park, Alaska USA. Right Panel: Toolik
Field Station, Alaska USA. Points are colored by Phylum.
NEON_MAGs_metagenomes_chemistry_Alaska %>%
ggplot(aes(x=`Phylum`, fill=`Order`))+geom_bar()+facet_wrap(vars(Site.x), scales="free_y", ncol=3) +labs(y="MAGs (n)", x="Phylum")+theme_classic()+John_theme+theme(strip.text = element_text(size=10), legend.title=element_text(size=15), legend.text=element_text(size=10, face='italic'))
Figure 26: Phyla MAGs distribution differs at
each Alaskan NEON Site. Stacked barplot of Phyla at Alaskan
NEON Sites in GOLD Study ID Gs0161344 . Left Panel: Caribou Creek
Watershed, Alaska USA. Middle Panel: Healy, Denali National Park, Alaska
USA. Right Panel: Toolik Field Station, Alaska USA. Bars are colored by
Order group.
Given the significant distance between the three Alaskan sites and the rest of the NEON collection sites, the ecological conditions of all Alaskan MAGs were first examined together to see if phylogenetic distribution was more correlated with ecological conditions than geographic position in North America. As expected, the three Alaskan sites had the lowest sample temperatures recorded in this study (Fig. 22A). Similarly, their soil pH was among the lowest in the study (Fig. 22B). However, when examining their phyla side by side at these conditions it became clear at the phyla level that the three sites had similar temperatures, pH, and phyla distribution (Fig 23,24). The Alaskan sites did vary in vegetation class with Toolik consisting of three scrub types, Healy of just dwarf Scrub, and Caribou Creek of forest and scrub. Toolik Field station had the most MAGs and phyla represented followed by Healy (Fig. 26), indicating that scrub vegetation classes may host greater bacterial populations in Alaska than forest.
### Figure 27:
NEON_MAGs_metagenomes_chemistry_TOOL %>%
mutate(`Genome Size (Kbp)`=as.integer(`Total Number of Bases`/1000)) %>%
ggplot(aes(x=`Phylum`,y=`Genome Size (Kbp)`,color=`Phylum`))+geom_boxplot(show.legend = FALSE)+scale_y_continuous(limits=c(0,15000), breaks=c(2500,5000,7500,10000,12500,15000))+theme_classic()+labs(x="Phylum", y="Genome Size (Kbp)")+John_theme
Figure 27: Genome Size Distributions at Toolik
Field Station are inline with those of the wider Gold Study.
Boxplot of Phyla genome sizes (kbp) found at Toolik Field Station,
Alaska USA in GOLD Study ID Gs0161344.
ggtree(tree_bac_TOOL_MAGs) %<+%
NEON_MAGs_metagenomes_chemistry +
geom_tippoint(aes(colour=`Phylum`)) +
# For unknown reasons the following does not like blank spaces in the names
geom_facet(panel = "Gene Count (n)", data = NEON_MAGs_metagenomes_chemistry_TOOL_noblank, geom = geom_point,
mapping=aes(x = GeneCount, color=Phyla))+
geom_facet(panel = "% GC Content ", data = NEON_MAGs_metagenomes_chemistry_TOOL_noblank, geom = geom_col,
aes(x = GCassembled,fill=Phyla), orientation = 'y', width = .6, show.legend=FALSE) +
theme_tree2(legend.position=c(.1, .7),strip.text=element_text(size=15),axis.text=element_text(color = 'black'))
Figure 28: Genome content of bacteria found at
Toolik Field Station. Left Panel: Phylogenetic tree of all MAGs
found at Toolik Field Station, Alaska USA with markers for MAG phylum.
Middle Panel: Gene Count of all Mags found at Toolik Field Station,
Alaska USA. Points colored by MAG phylum. Right Panel: % GC Content of
all MAGs found at Toolik Field Station, Alaska USA. Bar colored by MAG
phylum. All samples from in GOLD Study ID Gs0161344.
The genome size of MAGs collected at Toolik Field Station reflected those of the overall GOLD study with the trend of greater representation correlating with greater variation in phyla genome size (Fig. 27,28). This was especially evident for the phyla Patescibacteria, Armatimonadota, Gemmatimonadota and Verrucomicrobiota which had low representation in Toolik MAGs (Fig.28). The genomes at Toolik had a spread of gene counts with Chloroflexota and Pseudomonadota species at the low and high end of the spectrum, respectively (Fig.28). This variation was not reflected in genome GC content with the average %GC around 60% (Fig.28). ### Figure 29:
NEON_MAGs_metagenomes_chemistry_TOOL %>%
filter(Class=="Gammaproteobacteria") %>%
ggplot(aes(x=fct_rev(fct_infreq(Order)), fill=Family))+geom_bar()+coord_flip()+labs(x="Order", y="MAG Count (n)",fill="Family")+theme_classic()+theme(axis.text.x=element_text(color="black"),axis.title=element_text(size = 15), axis.text.y=element_text(color="black",face='italic'), legend.title=element_text(size=15),legend.text=element_text(size=10))+scale_y_continuous(limits=c(0,50),breaks = c(0,10,20,30,40,50))
Figure 29: Gammabacteria
at Toolik Field Station fall into
Steroidobacterales or
Burkholderiales families.
Stacked Barplot of Gammaproteobacteria MAGs found at Toolik
Field Station, Alaska USA in GOLD Study ID Gs0161344. Bars colored by
family group.
NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales %>%
ggplot( aes(x=`Genus`,y=`Genome Size (Kbp)`,color=Genus))+geom_boxplot(show.legend = FALSE)+scale_y_continuous(limits=c(0,15000), breaks=c(2500,5000,7500,10000,12500,15000))+theme_classic()+labs(title="A", x="Genus", y="Genome Size (Kbp)")+John_theme+theme(title = element_text(size=15))+
#genus count
ggplot(data=NEON_MAGs_metagenomes_chemistry_TOOL_Steroidobacterales,aes(x=Genus,fill=Genus))+geom_bar(show.legend=FALSE)+labs(title="B", x="Genus", y="MAGs (n)")+theme_classic()+John_theme+theme(title = element_text(size=15))
Figure 30: Distribution of
Steroidobacterales genera at Toolik
Field Station, Alaska USA. A) Boxplot of sample genome size
(kbp) in the Steroidobacterales family at Toolik Field Station,
Alaska USA. B) Barplot of MAGs for each genus in the
Steroidobacterales family found at Toolik Field Station, Alaska
USA. All samples from GOLD Study ID Gs0161344
NEON_MAGs_metagenomes_chemistry_TOOL_Burkholderiales %>%
ggplot(aes(x=Genus,fill=`Family`))+geom_bar()+labs(x="Genus", y="MAGs (n)")+theme_classic()+John_theme+theme(legend.title = element_text(size=15),legend.text = element_text(size=10, face='italic'))+scale_y_continuous(limits=c(0,5),breaks=rep(0:5))
Figure 31: Genus distribution of
Burkholderiales is wide spread at
Toolik Field Station, Alaska USA. Barplot of MAGs in
Burkholderiales Genera at Toolik Field Station, Alaska USA.
Bars colored by family group. NA represent novel genera in
Burkholderiales. All samples from GOLD Study ID Gs0161344.
Only two orders of Gammaproteobacteria were found at Toolik Field Station. Similar to the overall GOLD study, the order Steroidobacterales had the most MAGs with the majority of them belonging to the BOG-1198 genus (Fig. 29,30). The order Burkholderiales had 6 families found at Toolik Field Station (Fig. 31). The genera Caballeronia, Herbaspirillum, Paraburkholderi, and Trinicki were not found at Toolik (Fig. 18B, 31).
The MAG data collected and annotated in GOLD Study ID Gs0161344 represents a treasure trove of environmental and genomic information that can help researchers track climate and bacterial population changes across the United States. This study collected MAGs from 24 phyla and discovered over 300 novel species of bacteria. The major contributors of this data set, Actinomycelota, Pseudomonadota, and Acidobacteriota, were found at the majority of sample sites as the highest represented phyla (Fig. 5). This study also had great representation of the ecosystem subtypes and soil temperatures seen in the United States. Furthermore, National Grasslands LBJ site in Texas, appears to be a particular rich site for bacterial species. This dataset will provide a good baseline for tracking population changes due to changing climate changes and help identify beneficial bacterium for agricultural production. There may be a trend to more acidic, hotter soils with increased carbon emissions in the atmosphere.
As expected for a class with 381 genera, Gammaprotobacteria were found in every ecosystem subtype sampled in this study. Members like those in the order Steroidobacterales were found at nearly every soil temperature and pH sampled, indicating this class’s diversity allows for members to survive in a variety of environments.
Given their wide distribution across environment types, members of the Steroidobacterales order warrant further investigation as potential bacterial sources of climate change resistant proteins. Despite their bog naming convention, Steroidobacteraceae BOG-1198 had the largest representation in this order across multiple ecosystem subtypes, indicating this genus may contain members particularly adept at surviving a variety of environmental challenges. Additionally, the order’s underrepresentation in tropical forest and desert ecosystems is quite curious. These ecosystem subtypes are quite ecologically different from each other, but their unifying higher soil temperatures do not seem to be an issue for Steroidobacterales members at other NEON sites. Given that the tropical forest ecosystem is only found at the Puerto Rico NEON site, the absence of Steroidobacterales at that location may be related to its geographic isolation as an island.
Several notable genera in Burkholderiales relevant to bacterial biocontrol were found in this study, Caballeronia, Paraburkholderia, Trinickia. The genera Caballeronia and Paraburkhoderia have been reported to contain nitrogen fixating bacteria that may be useful for improving crop yields (Estrada-de los Santos et al. 2018). Agricultural nitrogen sources usually come in the form of chemical fertilizer that can be washed into the local watershed by precipitation. Bacteria like those in the Caballeronia and Paraburkholderia genera, are being investigated for the possibility of increasing crop nitrogen resources through the augmentation of traditional fertilizing methods with their introduction to crop rhizospheres. However, these two genera were found in Caribou Creek Alaska and Yellowstone, Wyoming. Temperatures at these locations range from temperate to cold. While this study’s Caballeronia and Paraburkholderia MAGs may contain species useful for nitrogen fixation, they may also be vulnerable to increases soil temperature caused by climate change. The genera Trinickia has been reported to contain both plant growth promoting and pathogenic bacteria species (Estrada-de los Santos et al. (2018)). More investigation into the Trinickia population at Wind River, Washington in needed to determine if this study’s MAGs are potentially agriculturally beneficial.
MAGs collected at Toolik Field Station in this study largely did not differ phylogenetically from those collected at the other Alaskan sites. All three sites had similar soil temperatures and pH but differed in vegetation classes. This difference may have contributed to the higher overall MAG totals at Toolik. Those MAGs were dominated by those from the Actinomycelota and Pseudomonadota phyla. Notably, Toolik differed from the other Alaskan sites with a large population of Acidobacteriota MAGs (Fig. 27). Interesting, the **Acidobacteriota* * MAGs found at Toolik had a fairly tight distribution of total genes and genome size, especially compared to Actinomycelota and Pseudomonadota (Fig. 27,28). This may indicate that this phylum’s genome is more conserved at Toolik than the other two dominate phyla. More sampling of bacteria from similar ecosystems in outside the United States may help prove if the phylogenetic distribution seen at Alaskan sites can be attributed to ecosystem subtype or geographic latitudinal differences.
Disappointingly, no genera known for their biocontrol properties were found at Toolik Field Station. However, as temperatures and climates fluctuate with rising global temperatures it will be interesting to see if the Caballeronia, Paraburkholderia, and Trinickia genera found in Caribou Creek Alaska, Yellowstone, Wyoming, and Wind River, Washington begin to appear in the thawing permafrost at Toolik. Given the genus Steroidobacteraceae BOG-1198’s ability to survive in more temperate climates, it would not be surprising if its representation at Toolik increased in future studies.
datatable(NEON_MAGs_metagenomes_chemistry)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
datatable(NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria)
datatable(NEON_MAGs_metagenomes_chemistry_TOOL)
NEON_MAGs_metagenomes_chemistry %>%
ggplot(aes(y=`Site.x`, x=nlcdClass))+geom_point()+labs(title="NEON Site Vegetation Classes", y="Site", x="Vegetation Class")+theme_classic()+theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))
NEON_MAGs_metagenomes_chemistry %>%
ggplot(aes(y=`Site.x`, x=`Ecosystem Subtype`))+geom_point()+labs(title="NEON Site Ecosystem Subtypes", y="Site", x="Ecosystem Subtype")+theme_classic()+theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))
NEON_MAGs_metagenomes_chemistry %>%
ggplot(aes(x=`Site ID.x`,y=Site.x))+geom_point()+labs("NEON Site IDs", y="Site",x="Site ID")+theme_classic()+theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))
knitr::include_url("data/lab14/sankey-NEON_MAG_co_pavian.txt.html")
Supplemental Figure 4: Sankey plot of all NEON combined assembly MAGs
knitr::include_url("data/lab14/sankey-NEON_MAG_co_Gpro(1).html")
library(tidyverse)
library(knitr)
library(DT)
iris_setosa <- iris %>%
filter(Species=="setosa") %>%
filter(Sepal.Length>5)
kable(iris_setosa)
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 5.4 | 3.7 | 1.5 | 0.2 | setosa |
| 5.8 | 4.0 | 1.2 | 0.2 | setosa |
| 5.7 | 4.4 | 1.5 | 0.4 | setosa |
| 5.4 | 3.9 | 1.3 | 0.4 | setosa |
| 5.1 | 3.5 | 1.4 | 0.3 | setosa |
| 5.7 | 3.8 | 1.7 | 0.3 | setosa |
| 5.1 | 3.8 | 1.5 | 0.3 | setosa |
| 5.4 | 3.4 | 1.7 | 0.2 | setosa |
| 5.1 | 3.7 | 1.5 | 0.4 | setosa |
| 5.1 | 3.3 | 1.7 | 0.5 | setosa |
| 5.2 | 3.5 | 1.5 | 0.2 | setosa |
| 5.2 | 3.4 | 1.4 | 0.2 | setosa |
| 5.4 | 3.4 | 1.5 | 0.4 | setosa |
| 5.2 | 4.1 | 1.5 | 0.1 | setosa |
| 5.5 | 4.2 | 1.4 | 0.2 | setosa |
| 5.5 | 3.5 | 1.3 | 0.2 | setosa |
| 5.1 | 3.4 | 1.5 | 0.2 | setosa |
| 5.1 | 3.8 | 1.9 | 0.4 | setosa |
| 5.1 | 3.8 | 1.6 | 0.2 | setosa |
| 5.3 | 3.7 | 1.5 | 0.2 | setosa |
#data table from object
datatable(iris_setosa)
#data table in one piping
datatable(
iris %>%
filter(Species=="setosa") %>%
filter(Sepal.Length>5))
NEON_MAGs <- read_csv("/cloud/project/data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(NEON_MAGs)
## # A tibble: 6 × 19
## `Bin ID` `Genome Name` `IMG Genome ID` `Bin Quality` `Bin Lineage`
## <chr> <chr> <dbl> <chr> <chr>
## 1 3300060643_14 Terrestrial soil mi… 3300060643 MQ <NA>
## 2 3300060643_16 Terrestrial soil mi… 3300060643 MQ Bacteria
## 3 3300060643_18 Terrestrial soil mi… 3300060643 MQ Bacteria; Ac…
## 4 3300060643_2 Terrestrial soil mi… 3300060643 MQ Bacteria; Ac…
## 5 3300060643_28 Terrestrial soil mi… 3300060643 MQ Bacteria; Ps…
## 6 3300060643_35 Terrestrial soil mi… 3300060643 MQ Bacteria; Ac…
## # ℹ 14 more variables: `GTDB-Tk Taxonomy Lineage` <chr>, `Bin Methods` <chr>,
## # `Created By` <chr>, `Date Added` <date>, `Bin Completeness` <dbl>,
## # `Bin Contamination` <dbl>, `Total Number of Bases` <dbl>, `5s rRNA` <dbl>,
## # `16s rRNA` <dbl>, `23s rRNA` <dbl>, `tRNA Genes` <dbl>, `Gene Count` <dbl>,
## # `Scaffold Count` <dbl>, `GOLD Study ID` <chr>
str(NEON_MAGs)
## spc_tbl_ [1,754 × 19] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Bin ID : chr [1:1754] "3300060643_14" "3300060643_16" "3300060643_18" "3300060643_2" ...
## $ Genome Name : chr [1:1754] "Terrestrial soil microbial communities from National Grasslands LBJ, Texas, USA - CLBJ_001-M-20210506-comp-1" "Terrestrial soil microbial communities from National Grasslands LBJ, Texas, USA - CLBJ_001-M-20210506-comp-1" "Terrestrial soil microbial communities from National Grasslands LBJ, Texas, USA - CLBJ_001-M-20210506-comp-1" "Terrestrial soil microbial communities from National Grasslands LBJ, Texas, USA - CLBJ_001-M-20210506-comp-1" ...
## $ IMG Genome ID : num [1:1754] 3.3e+09 3.3e+09 3.3e+09 3.3e+09 3.3e+09 ...
## $ Bin Quality : chr [1:1754] "MQ" "MQ" "MQ" "MQ" ...
## $ Bin Lineage : chr [1:1754] NA "Bacteria" "Bacteria; Actinomycetota; Actinomycetes" "Bacteria; Actinomycetota; Actinomycetes" ...
## $ GTDB-Tk Taxonomy Lineage: chr [1:1754] "d__Bacteria;p__Acidobacteriota;c__Blastocatellia;o__Pyrinomonadales;f__Pyrinomonadaceae;g__PSRF01;s__" "d__Bacteria;p__Acidobacteriota;c__Vicinamibacteria;o__Vicinamibacterales;f__UBA2999;g__Gp6-AA45;s__" "d__Bacteria;p__Actinomycetota;c__Actinomycetia;o__Streptosporangiales;f__Streptosporangiaceae;g__Chersky-822;s__" "d__Bacteria;p__Actinomycetota;c__Actinomycetia;o__Mycobacteriales;f__Jatrophihabitantaceae;g__;s__" ...
## $ Bin Methods : chr [1:1754] "MetaBAT v2:2.15, CheckM v1.2.1, GTDB-tk v2.3.2, GTDB database release R214" "MetaBAT v2:2.15, CheckM v1.2.1, GTDB-tk v2.3.2, GTDB database release R214" "MetaBAT v2:2.15, CheckM v1.2.1, GTDB-tk v2.3.2, GTDB database release R214" "MetaBAT v2:2.15, CheckM v1.2.1, GTDB-tk v2.3.2, GTDB database release R214" ...
## $ Created By : chr [1:1754] "IMG_PIPELINE_JLB" "IMG_PIPELINE_JLB" "IMG_PIPELINE_JLB" "IMG_PIPELINE_JLB" ...
## $ Date Added : Date[1:1754], format: "2024-04-21" "2024-04-21" ...
## $ Bin Completeness : num [1:1754] 96.2 77.5 77.2 58.4 68.7 ...
## $ Bin Contamination : num [1:1754] 2.56 5.3 1.99 3.74 4.67 0 2.97 3.16 1.71 5.17 ...
## $ Total Number of Bases : num [1:1754] 6247032 5394623 4389455 3228217 3245901 ...
## $ 5s rRNA : num [1:1754] 0 0 0 0 0 1 3 0 1 0 ...
## $ 16s rRNA : num [1:1754] 1 0 0 0 0 0 1 1 0 0 ...
## $ 23s rRNA : num [1:1754] 0 0 0 0 0 1 1 0 1 0 ...
## $ tRNA Genes : num [1:1754] 54 32 35 29 12 26 24 37 47 34 ...
## $ Gene Count : num [1:1754] 5373 5406 4705 3762 3446 ...
## $ Scaffold Count : num [1:1754] 39 878 607 592 474 386 270 547 10 186 ...
## $ GOLD Study ID : chr [1:1754] "Gs0161344" "Gs0161344" "Gs0161344" "Gs0161344" ...
## - attr(*, "spec")=
## .. cols(
## .. `Bin ID` = col_character(),
## .. `Genome Name` = col_character(),
## .. `IMG Genome ID` = col_double(),
## .. `Bin Quality` = col_character(),
## .. `Bin Lineage` = col_character(),
## .. `GTDB-Tk Taxonomy Lineage` = col_character(),
## .. `Bin Methods` = col_character(),
## .. `Created By` = col_character(),
## .. `Date Added` = col_date(format = ""),
## .. `Bin Completeness` = col_double(),
## .. `Bin Contamination` = col_double(),
## .. `Total Number of Bases` = col_double(),
## .. `5s rRNA` = col_double(),
## .. `16s rRNA` = col_double(),
## .. `23s rRNA` = col_double(),
## .. `tRNA Genes` = col_double(),
## .. `Gene Count` = col_double(),
## .. `Scaffold Count` = col_double(),
## .. `GOLD Study ID` = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
#remove combined assembly
NEON_MAGs_Ind <- NEON_MAGs %>%
filter(`Genome Name`!="NEON combined assembly")#use grave quotes (tilda key)
#count mags
NEON_MAGs_Ind %>%
count(`Bin Quality`,sort=TRUE)
## # A tibble: 2 × 2
## `Bin Quality` n
## <chr> <int>
## 1 MQ 1031
## 2 HQ 99
#kabl table
kable(NEON_MAGs_Ind %>%
count(`Bin Quality`))
| Bin Quality | n |
|---|---|
| HQ | 99 |
| MQ | 1031 |
#filter HQ bin quality and data table
datatable(NEON_MAGs_Ind %>%
filter(`Bin Quality`=="HQ"))
#Select GTDB taxonomy and the MAGs genome size then filter to all MAGs greater than 10,000,000 bases
kable(NEON_MAGs_Ind%>%
select(c(`GTDB-Tk Taxonomy Lineage`,`Total Number of Bases`)) %>%
filter(`Total Number of Bases`>10000000))
| GTDB-Tk Taxonomy Lineage | Total Number of Bases |
|---|---|
| d__Bacteria;p__Myxococcota;c__Polyangia;o__Polyangiales;f__JAFGIB01;g;s | 10348325 |
| d__Bacteria;p__Actinomycetota;c__Actinomycetia;o;f;g;s | 10920999 |
| d__Bacteria;p__Actinomycetota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;gPseudonocardia;s | 10263695 |
| d__Bacteria;p__Actinomycetota;c__Actinomycetia;o__Mycobacteriales;f__Pseudonocardiaceae;gActinophytocola;s | 12584079 |
| d__Bacteria;p__Actinomycetota;c__Actinomycetia;o__Mycobacteriales;f__Micromonosporaceae;gPlanosporangium;s | 11546507 |
| d__Bacteria;p__Actinomycetota;c__Actinomycetia;o__Streptosporangiales;f__Streptosporangiaceae;gUBA9676;s | 10351501 |
# datatable filtering on string
datatable(NEON_MAGs_Ind %>%
filter(str_detect(`GTDB-Tk Taxonomy Lineage`, "Bacteroidota")))
#filter to just Yellowstone
datatable(NEON_MAGs_Ind %>%
filter(str_detect(`Genome Name`, 'Yellowstone NP')))
#separate taxonomy by ;
NEON_MAGs_Ind_Tax <- NEON_MAGs_Ind %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"),"; ", remove=FALSE)
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 1130 rows [1, 2, 3, 4, 5,
## 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
#count the Phylum contents
datatable(NEON_MAGs_Ind_Tax %>%
count(Phylum, sort=TRUE))
#separate genome name into columns
NEON_MAGs_Ind_Tax_Sample <- NEON_MAGs_Ind_Tax %>%
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>% #this mutates genome name column data to replace the string with ""
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
mutate_at("Sample Name", str_replace, "S-comp-1", "") %>%
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-",)
## Warning: Expected 3 pieces. Additional pieces discarded in 1130 rows [1, 2, 3, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
#count MAGs at each site
datatable(NEON_MAGs_Ind_Tax_Sample %>%
count(Site, sort = TRUE))
Use view(iris) to see the whole data table. Subset the table based on a different species than was used in the example. Display the table using DT::datatable
#view(iris)
datatable(iris %>%
filter(Species=="virginica"))
Display using DT::datatable the NEON MAGs from the individual assemblies that have at least 1 16S rRNA
datatable(NEON_MAGs_Ind %>%
filter(`16s rRNA`>0))
Display a table of the MAGs from Lower Teakettle with only the columns for the Genome Name, GTDB-Tk Taxonomy Lineage, and estimated MAG genome size.
datatable(NEON_MAGs_Ind_Tax %>%
filter(str_detect(`Genome Name`, "Lower Teakettle")) %>%
mutate(`Estimated Genome Size (Kbp)`=as.integer(`Total Number of Bases`/(`Bin Completeness`/100)/1000)) %>%
select(c("Genome Name","GTDB-Tk Taxonomy Lineage","Estimated Genome Size (Kbp)")))
Display a table with the Class counts at LBJ National Grasslands
datatable(NEON_MAGs_Ind_Tax_Sample %>%
filter(str_detect(Site, "National Grasslands LBJ")) %>%
count(Class, sort=TRUE))
Display a table with the counts for the Phylum Actinobacteriota at each Site
datatable(NEON_MAGs_Ind_Tax_Sample %>%
filter(Phylum=="Actinobacteriota") %>%
count(Site, sort=TRUE))
library(tidyverse)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(viridisLite)
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color=Species, shape=Species))+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color=Species, shape=Species))+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species") +theme_classic()
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color="red", shape=Species))+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color="species", shape=Species))+scale_color_manual(values=c("blue", "purple", "red"))+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color="species", shape=Species))+scale_color_brewer(palette="dark2")+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
## Warning: Unknown palette: "dark2"
ggplot(data=iris,aes(x=Sepal.Length, y=Sepal.Width))+geom_point(aes(color="species", shape=Species))+scale_color_viridis_d()+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width", color="Plant Species", shape="Plant Species")
pdf("images/iris_example_plot1.pdf", width=6, height=3)
ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point()+labs(title="Iris Sepal Length vs Wide", x="Sepal Length", y="Sepal Width")
dev.off()
## png
## 2
ppi <- 300
png("images/iris_example_plot2.png", width=6*ppi, height=4*ppi, res=ppi)
ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point()
dev.off()
## png
## 2
ggplotly(ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point())
p <- ggplot(data=iris, aes(x=Sepal.Length, y=Sepal.Width, color=Species))+geom_point()
ggplotly(p)
#Format data set
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].
NEON_MAGs_bact_ind <- NEON_MAGs %>%
filter(Domain=="Bacteria") %>%
filter(`Assembly Type`=="Individual")
NEON_MAGs_bact_ind %>%
ggplot(aes(x=Phylum))+geom_bar()+coord_flip()
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_infreq(Phylum)))+geom_bar()+coord_flip()+labs(x="Phylum") #order by count
NEON_MAGs_bact_ind %>%
count(Phylum) %>%
ggplot(aes(x = reorder(Phylum, n), y = n)) +
geom_col(stat = "identity") +
coord_flip()
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`
NEON_MAGs_bact_ind %>%
count(Phylum) %>%
ggplot(aes(x=reorder(Phylum, n), y=n))+geom_col(stat="identity")+coord_flip()+labs(x="Phylum")
## Warning in geom_col(stat = "identity"): Ignoring unknown parameters: `stat`
# NEON_MAGs_bact_ind %>%
# ggplot(aes(x=fct_rev(fct_infreq(Phylum)), fill=Site))+geom_bar()+coord_flip()+labs(x="Phylum")
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Phylum)), fill=Site))+geom_bar(position="dodge")+coord_flip()+labs(x="Phylum")
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Phylum)), fill=Site))+geom_bar(position=position_dodge2(width=0.9, preserve= "single"))+coord_flip()
NEON_MAGs_bact_ind %>%
ggplot(aes(x=Phylum))+geom_bar(position=position_dodge2(width=0.9, preserve="single"))+coord_flip()+facet_wrap(vars(Site), scales="free_y", ncol=2)
NEON_MAGs_bact_ind %>%
ggplot(aes(x=`Total Number of Bases`))+geom_histogram(bins=50)
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_infreq(Phylum), y=`Total Number of Bases`))+geom_boxplot()+theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))+labs(x="Phylum")
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_infreq(Phylum), y=`Total Number of Bases`))+geom_point()+coord_flip()+labs(x="Phylum")
What are the overall class MAG counts?
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Class)), fill=Class))+geom_bar(show.legend=FALSE)+coord_flip()+labs(title="Overall MAG Count per Class", x="Class", y="MAG Count")+theme_classic()
What are the MAG counts for each subplot. Color by site ID.
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Subplot)), fill=`Site ID`))+geom_bar()+coord_flip()+labs(title="Overall MAG Count per Subplot", x="Subplot ID", y="MAG Count")+theme_classic()
How many novel bacteria were discovered (Show that number of NAs for each taxonomic level)?
NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) ) %>%
ggplot(aes(x=fct_rev(fct_infreq(Site)), fill=Site))+geom_bar(show.legend=FALSE)+coord_flip()+labs(title="Total Novel Bacteria per Site", x="Site", y="n")+theme_classic()
How many novel bacterial MAGs are high quality vs medium quality?
NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) ) %>%
ggplot(aes(x=fct_infreq(`Bin Quality`), fill=`Bin Quality`))+geom_bar()+labs(title="Novel Bacteria MAG Quality", x="Quality", y="MAGs")+theme_classic()
What phyla have novel bacterial genera?
NEON_MAGs_bact_ind %>%
filter(is.na(Class) | is.na(Order) | is.na(Family) | is.na(Genus) ) %>%
ggplot(aes(x=fct_infreq(`Phylum`), fill=`Phylum`))+geom_bar(show.legend=FALSE)+coord_flip()+labs(title="Novel Genera per Phylum", x="Phylum", y="n")+theme_classic()
Make a stacked bar plot of the total number of MAGs at each site using Phylum as the fill.
NEON_MAGs_bact_ind %>%
ggplot(aes(x=fct_rev(fct_infreq(Site)), fill=Phylum))+geom_bar()+coord_flip()+labs(title="Total Number of MAGs per site", x="Site", y="MAG Count",fill="Phylum")+theme_classic()
## Exercise #7
Using facet_wrap make plots of the total number of MAGs at each site for each phylum (e.g. similar to the example above but using the site ID and separating each graph by phylum.)
NEON_MAGs_bact_ind %>%
ggplot(aes(x=Phylum))+geom_bar(aes(fill=Phylum),position=position_dodge2(width=0.9, preserve="single"),show.legend=FALSE)+coord_flip()+facet_wrap(vars(`Site ID`), scales="free_y", ncol=4)+labs(title="Total Number of Mags per Phylum at Each Site", x="Phylum", y="n")+theme_classic()
What is the relationship between MAGs genome size and the number of genes? Color by Phylum.
NEON_MAGs_bact_ind %>%
mutate(`Estimated Genome Size (Kbp)`=as.integer(`Total Number of Bases`/(`Bin Completeness`/100)/1000)) %>%
ggplot(aes(x=`Gene Count`, y=`Estimated Genome Size (Kbp)`, color=Phylum))+geom_point()+labs(title="Gene Count vs Estimated Mag Genome Size", x="Gene", y="Estimated Genome Size (Kbp)")+theme_classic()
What is the relationship between scaffold count and MAG completeness?
NEON_MAGs_bact_ind %>%
ggplot(aes(x=`Scaffold Count`, y=`Bin Completeness`, color=Phylum))+geom_point()+labs(title="Scaffold Count vs MAG Completeness", x="Scaffold Count", y="MAG Completeness")+theme_classic()
library(tidyverse)
library(plotly)
library(knitr)
library(DT)
Located 400 miles north from Fairbanks, Alaska at the foot of the Brooks mountain range, biodiversity at Toolik Field Station is heavily influenced by its harsh winters where temperatures can reach -50⁰F. It is home to a variety of fauna including caribou, loons, voles, and polar bears. Located above the northern tree line, the vegetation in the tundra here mainly consists of birch, willow, sedges and grass. The site contains a large range of soil conditions, including layers of permafrost, created by glacial action (NEON 2023).
The class Gammaproteobacteria, under the phylum Proteobacteria, is made up of around 381 genera that thrive in marine, terrestial, and eukaryotic host ecosystems (Liao et al. 2020). Historically, this class has be defined phylogenetically by 16s rRNA sequence homology (Williams and Kelly 2013). Some notable members of this class include Escherichia coli, Vibrio fischeri, and Pseudomonas aeruginosa. This class has great diversity of morphologies with rod, cocci, spirilla, and filaments all represented (Williams et al. 2010). Additionally, species in class display a variety of trophisms including chemoautotrophs and photoautotrophs (Gao, Mohan, and Gupta 2009).
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].
NEON_MAGs_Toolik <- NEON_MAGs %>%
filter(str_detect(`Site`, 'Toolik Field Station'))
NEON_MAGs_Toolik_GPROTEO <- NEON_MAGs_Toolik %>%
filter(str_detect(`Class`, 'Gammaproteobacteria'))
NEON_MAGs_GPROTEO <- NEON_MAGs %>%
filter(str_detect(`Class`, 'Gammaproteobacteria'))
datatable(NEON_MAGs_Toolik)
datatable(NEON_MAGs_Toolik %>%
count(Phylum))
datatable(NEON_MAGs_Toolik %>%
count(Class))
datatable(NEON_MAGs_Toolik %>%
count(Order))
datatable(NEON_MAGs_Toolik %>%
count(Family))
datatable(NEON_MAGs_Toolik %>%
count(Genus))
NEON_MAGs_Toolik %>% #Phylum into Order
ggplot(aes(x=fct_rev(fct_infreq(Phylum)), fill=Class))+geom_bar()+coord_flip()+labs(title="Taxonomic Breakdown by Phylum of Toolik Field Station MAGs", x="Phylum", y="n", fill="Class")+theme_classic()
NEON_MAGs_Toolik %>% #Class into Order
ggplot(aes(x=Class))+geom_bar(aes(fill=`Order`),position=position_dodge2(width=0.9, preserve="single"))+coord_flip()+facet_wrap(vars(`Phylum`), scales="free_y", ncol=2)+labs(title="Taxonomic Breakdown by Class of Toolik Field Station MAGs", x="Class", y="n", fill="Order")+theme_classic()
NEON_MAGs_Toolik %>% #Order into family
ggplot(aes(x=Order))+geom_bar(aes(fill=`Family`),position=position_dodge2(width=0.9, preserve="single"))+coord_flip()+facet_wrap(vars(`Class`), scales="free_y", ncol=2)+labs(title="Taxonomic Breakdown by Order of Toolik Field Station MAGs", x="Order", y="n", fill="Family")+theme_classic()
NEON_MAGs_Toolik %>% #Family into Genus
ggplot(aes(x=Family))+geom_bar(aes(fill=`Genus`),position=position_dodge2(width=0.9, preserve="single"))+coord_flip()+facet_wrap(vars(`Class`), scales="free_y", ncol=2)+labs(title="Taxonomic Breakdown by Family of Toolik Field Station MAGs", x="Family", y="n", fill="Genus")+theme_classic()
NEON_MAGs_GPROTEO %>% #Site Locations
filter(!str_detect(`Site`, "NEON combined assembly")) %>%
ggplot(aes(x=fct_rev(fct_infreq(`Site`)), fill=`Order`))+geom_bar()+coord_flip()+labs(title="Gammaproteobacteria MAGs Sample Locations", x="Site", y="n", fill="Order")+theme_classic()
datatable(NEON_MAGs_GPROTEO)
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
# filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T))
## Rows: 176 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (18): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, ...
## dbl (16): taxon_oid, IMG Genome ID, Depth In Meters, Elevation In Meters, Ge...
## lgl (12): Altitude In Meters, Chlorophyll Concentration, Longhurst Code, Lon...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [101].
## Warning: Expected 3 pieces. Additional pieces discarded in 85 rows [2, 4, 10, 12, 16,
## 17, 19, 20, 21, 22, 23, 24, 29, 30, 31, 33, 34, 35, 36, 37, ...].
NEON_META_Toolik <- NEON_metagenomes %>%
filter(str_detect(`Sample Name`,'re-annotation')) %>%
filter(str_detect(`Site`,'Toolik'))
datatable(NEON_META_Toolik)
library(tidyverse)
library(knitr)
NEON_MAGs <- read_csv("/cloud/project/data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].
# NEON_MAGs <- read_csv("~/Bio 676/Lab12/data/GOLD_Study_ID_Gs0161344_NEON_edArchaea.csv") %>%
# # remove columns that are not needed for data analysis
# select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`)) %>%
# # create a new column with the Assembly Type
# mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
# TRUE ~ "Individual")) %>%
# mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
# separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), "; ", remove = FALSE) %>%
# # Get rid of the the common string "Soil microbial communities from "
# mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# # Use the first `-` to split the column in two
# separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# # Get rid of the the common string "S-comp-1"
# mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# # separate the Sample Name into Site ID and plot info
# separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# # separate the plot info into 3 columns
# separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
NEON_metagenomes <- read_tsv("/cloud/project/data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T))
## Rows: 176 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (18): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, ...
## dbl (16): taxon_oid, IMG Genome ID, Depth In Meters, Elevation In Meters, Ge...
## lgl (12): Altitude In Meters, Chlorophyll Concentration, Longhurst Code, Lon...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [53].
NEON_chemistry <- read_tsv("/cloud/project/data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
# remove -COMP from genomicsSampleID
mutate_at("genomicsSampleID", str_replace, "-COMP", "")
## Rows: 87 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): genomicsSampleID, siteID, plotID, nlcdClass, horizon
## dbl (11): decimalLatitude, decimalLongitude, elevation, soilTemp, d15N, org...
## date (1): collectionDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(
NEON_chemistry_description <- read_tsv("/cloud/project/data/NEON/neon_soilChem1_metadata_descriptions.tsv")
)
## Rows: 23 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): fieldName, description, dataType, units
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
| fieldName | description | dataType | units |
|---|---|---|---|
| siteID | NEON site code | string | NA |
| plotID | Plot identifier (NEON site code_XXX) | string | NA |
| sampleID | Identifier for sample | string | NA |
| horizon | Organic or mineral soil | string | NA |
| genomicsSampleID | Identifier for a genomics sample | string | NA |
| d15N | Measure of the ratio of 15N:14N in a sample, relative to atmospheric N2 | real | permill |
| organicd13C | Measure of the ratio of 13C:12C in soil organic carbon, relative to Vienna Pee Dee Belemnite | real | permill |
| nitrogenPercent | Percent nitrogen in a sample on a dry weight basis | real | percent |
| organicCPercent | Percent organic carbon in a sample on a dry weight basis | real | percent |
| CNratio | Ratio of carbon to nitrogen concentration in a sample on a dry weight basis | real | NA |
| nlcdClass | National Land Cover Database Vegetation Type Name | string | NA |
| subplotID | Identifier for the NEON subplot | string | NA |
| coreCoordinateX | x location of the soil core relative to the SW corner | real | meter |
| coreCoordinateY | y location of the soil core relative to the SW corner | real | meter |
| decimalLatitude | The geographic latitude (in decimal degrees, WGS84) of the geographic center of the reference area | real | decimalDegree |
| decimalLongitude | The geographic longitude (in decimal degrees, WGS84) of the geographic center of the reference area | real | decimalDegree |
| elevation | Elevation (in meters) above sea level | real | meter |
| sampleTiming | Timing of the sampling event with regard to the field season | string | NA |
| soilTemp | In-situ temperature of soil at approximately 10 cm depth | real | degree |
| sampleTopDepth | Depth below the soil surface of the top of a soil sample | real | centimeter |
| sampleBottomDepth | Depth below the soil surface of the bottom of a soil sample | real | centimeter |
| soilInWaterpH | pH value of soil measured in water solution | real | pH |
| soilInCaClpH | pH value of soil measured in calcium chloride solution | real | pH |
band_members
## # A tibble: 3 × 2
## name band
## <chr> <chr>
## 1 Mick Stones
## 2 John Beatles
## 3 Paul Beatles
band_instruments
## # A tibble: 3 × 2
## name plays
## <chr> <chr>
## 1 John guitar
## 2 Paul bass
## 3 Keith guitar
# drops any row in the second data set that does not match a row in the first data set.
band_members %>%
left_join(band_instruments, by="name")
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
# drops any row in the first data set that does not match a row in the second data set.
band_members %>%
right_join(band_instruments, by="name")
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
## 3 Keith <NA> guitar
#drops any row in either data set that does not have a match in both data sets
band_members %>%
inner_join(band_instruments, by="name")
## # A tibble: 2 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
#retains every row from both data sets; it is the only join guaranteed to retain all of the original data
band_members %>%
full_join(band_instruments, by="name")
## # A tibble: 4 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
## 4 Keith <NA> guitar
#join by two specified columns
table1 %>%
left_join(table3, by = c("country", "year"))
## # A tibble: 6 × 5
## country year cases population rate
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 Afghanistan 1999 745 19987071 745/19987071
## 2 Afghanistan 2000 2666 20595360 2666/20595360
## 3 Brazil 1999 37737 172006362 37737/172006362
## 4 Brazil 2000 80488 174504898 80488/174504898
## 5 China 1999 212258 1272915272 212258/1272915272
## 6 China 2000 213766 1280428583 213766/1280428583
#join by columns with different names
#For each element,
# Write the name of the column that appears in the first data frame
# Write an equals sign
# Write the name of the matching column that appears in the second data set.
band_members %>%
left_join(band_instruments2, by = c(name = "artist"))
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
#tables with multiple same name columns that were not joined on will be labeled x,y
table4a %>%
left_join(table4b, by = "country")
## # A tibble: 3 × 5
## country `1999.x` `2000.x` `1999.y` `2000.y`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 745 2666 19987071 20595360
## 2 Brazil 37737 80488 172006362 174504898
## 3 China 212258 213766 1272915272 1280428583
# set suffix
table4a %>%
left_join(table4b, by = "country", suffix = c("_cases", "_pop"))
## # A tibble: 3 × 5
## country `1999_cases` `2000_cases` `1999_pop` `2000_pop`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 745 2666 19987071 20595360
## 2 Brazil 37737 80488 172006362 174504898
## 3 China 212258 213766 1272915272 1280428583
Create some tables with just a few columns to work on the basics of table joins by making a new data frame selecting In NEON_MAGs the columns Sample Name, Site ID, GTDB-Tk Taxonomy Lineage In NEON_metagenomes the columns Sample Name, Site ID, Ecosystem Subtype In NEON_chemistry the columns genomicsSampleID, siteID, nlcdClass
#In NEON_MAGs the columns Sample Name, Site ID, GTDB-Tk Taxonomy Lineage
NEON_MAGs_Short <- NEON_MAGs %>%
select(c(`Sample Name`,`Site ID`,`GTDB-Tk Taxonomy Lineage`))
#In NEON_metagenomes the columns Sample Name, Site ID, Ecosystem Subtype
NEON_metagenomes_Short <- NEON_metagenomes %>%
select(c(`Sample Name`, `Site ID`, `Ecosystem Subtype`))
#In NEON_chemistry the columns genomicsSampleID, siteID, nlcdClass
NEON_chemistry_Short <- NEON_chemistry %>%
select(c(`genomicsSampleID`, `siteID`, `nlcdClass`))
Now filter the above NEON_MAGs, NEON_metagenomes and NEON_chemistry to contain just the data for your project site
NEON_MAGs_Short_TOOL <-NEON_MAGs_Short %>%
filter(str_detect(`Site ID`,"TOOL"))
NEON_metagenomes_Short_TOOL <-NEON_metagenomes_Short %>%
filter(str_detect(`Site ID`,"TOOL"))
NEON_chemistry_Short_TOOL <-NEON_chemistry_Short %>%
filter(str_detect(`siteID`,"TOOL"))
Do a left join of the NEON MAGs with NEON metagenomes by the Sample Name and show the resulting table. Notice what happens when 2 columns have the same name. Did you get the number of rows you expected?
NEON_MAGs_Short %>%
left_join(NEON_metagenomes_Short, by="Sample Name" )
## # A tibble: 1,754 × 5
## `Sample Name` `Site ID.x` `GTDB-Tk Taxonomy Lineage` `Site ID.y`
## <chr> <chr> <chr> <chr>
## 1 CLBJ_001-M-20210506 CLBJ Bacteria;Acidobacteriota;Blastoc… CLBJ
## 2 CLBJ_001-M-20210506 CLBJ Bacteria;Acidobacteriota;Vicinam… CLBJ
## 3 CLBJ_001-M-20210506 CLBJ Bacteria;Actinomycetota;Actinomy… CLBJ
## 4 CLBJ_001-M-20210506 CLBJ Bacteria;Actinomycetota;Actinomy… CLBJ
## 5 CLBJ_001-M-20210506 CLBJ Bacteria;Pseudomonadota;Alphapro… CLBJ
## 6 CLBJ_001-M-20210506 CLBJ Bacteria;Acidobacteriota;Blastoc… CLBJ
## 7 CLBJ_001-M-20210506 CLBJ Bacteria;Chloroflexota;Ktedonoba… CLBJ
## 8 CLBJ_001-M-20210506 CLBJ Bacteria;Acidobacteriota;Blastoc… CLBJ
## 9 BONA_009-O-20210707 BONA Bacteria;Actinomycetota;Thermole… BONA
## 10 BONA_009-O-20210707 BONA Bacteria;Acidobacteriota;Terrigl… BONA
## # ℹ 1,744 more rows
## # ℹ 1 more variable: `Ecosystem Subtype` <chr>
Using the data from your site do a left join of the NEON chemistry with NEON metagenomes from above by the Sample Name and genomicsSampleID columns and show the resulting table. Use by = c(“Sample Name” = “genomicsSampleID”)). Did you get the number of rows you expected?
NEON_metagenomes_Short_TOOL %>%
left_join(NEON_chemistry_Short_TOOL,by = c("Sample Name" = "genomicsSampleID"))
## # A tibble: 9 × 5
## `Sample Name` `Site ID` `Ecosystem Subtype` siteID nlcdClass
## <chr> <chr> <chr> <chr> <chr>
## 1 TOOL_002-O-20210804 TOOL Tundra TOOL dwarfScrub
## 2 TOOL_041-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 3 TOOL_003-O-20210805 TOOL Tundra TOOL sedgeHerbaceous
## 4 TOOL_005-O-20210806 TOOL Tundra TOOL dwarfScrub
## 5 TOOL_042-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 6 TOOL_006-O-20210804 TOOL Tundra TOOL shrubScrub
## 7 TOOL_044-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 8 TOOL_004-O-20210805 TOOL Tundra TOOL dwarfScrub
## 9 TOOL_043-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
Does it matter with these tables if your do a left, right or full join. Show the resulting tables
NEON_metagenomes_Short_TOOL %>%
left_join(NEON_chemistry_Short_TOOL,by = c("Sample Name" = "genomicsSampleID"))
## # A tibble: 9 × 5
## `Sample Name` `Site ID` `Ecosystem Subtype` siteID nlcdClass
## <chr> <chr> <chr> <chr> <chr>
## 1 TOOL_002-O-20210804 TOOL Tundra TOOL dwarfScrub
## 2 TOOL_041-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 3 TOOL_003-O-20210805 TOOL Tundra TOOL sedgeHerbaceous
## 4 TOOL_005-O-20210806 TOOL Tundra TOOL dwarfScrub
## 5 TOOL_042-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 6 TOOL_006-O-20210804 TOOL Tundra TOOL shrubScrub
## 7 TOOL_044-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 8 TOOL_004-O-20210805 TOOL Tundra TOOL dwarfScrub
## 9 TOOL_043-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
NEON_metagenomes_Short_TOOL %>%
right_join(NEON_chemistry_Short_TOOL,by = c("Sample Name" = "genomicsSampleID"))
## # A tibble: 9 × 5
## `Sample Name` `Site ID` `Ecosystem Subtype` siteID nlcdClass
## <chr> <chr> <chr> <chr> <chr>
## 1 TOOL_002-O-20210804 TOOL Tundra TOOL dwarfScrub
## 2 TOOL_041-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 3 TOOL_003-O-20210805 TOOL Tundra TOOL sedgeHerbaceous
## 4 TOOL_005-O-20210806 TOOL Tundra TOOL dwarfScrub
## 5 TOOL_042-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 6 TOOL_006-O-20210804 TOOL Tundra TOOL shrubScrub
## 7 TOOL_044-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 8 TOOL_004-O-20210805 TOOL Tundra TOOL dwarfScrub
## 9 TOOL_043-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
NEON_metagenomes_Short_TOOL %>%
full_join(NEON_chemistry_Short_TOOL,by = c("Sample Name" = "genomicsSampleID"))
## # A tibble: 9 × 5
## `Sample Name` `Site ID` `Ecosystem Subtype` siteID nlcdClass
## <chr> <chr> <chr> <chr> <chr>
## 1 TOOL_002-O-20210804 TOOL Tundra TOOL dwarfScrub
## 2 TOOL_041-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 3 TOOL_003-O-20210805 TOOL Tundra TOOL sedgeHerbaceous
## 4 TOOL_005-O-20210806 TOOL Tundra TOOL dwarfScrub
## 5 TOOL_042-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 6 TOOL_006-O-20210804 TOOL Tundra TOOL shrubScrub
## 7 TOOL_044-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
## 8 TOOL_004-O-20210805 TOOL Tundra TOOL dwarfScrub
## 9 TOOL_043-O-20210803 TOOL Tundra TOOL sedgeHerbaceous
Do a left join of the NEON chemistry with NEON metagenomes by site ID columns and show the resulting table. Did you get the number of rows you expected?
NEON_chemistry_Short %>%
left_join(NEON_metagenomes_Short, by =c("siteID"="Site ID"))
## Warning in left_join(., NEON_metagenomes_Short, by = c(siteID = "Site ID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 12 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## # A tibble: 635 × 5
## genomicsSampleID siteID nlcdClass `Sample Name` `Ecosystem Subtype`
## <chr> <chr> <chr> <chr> <chr>
## 1 GUAN_048-M-20210920 GUAN evergreenForest GUAN_048-M-20… Tropical forest
## 2 GUAN_048-M-20210920 GUAN evergreenForest GUAN_042-M-20… Tropical forest
## 3 GUAN_048-M-20210920 GUAN evergreenForest GUAN_006-M-20… Tropical forest
## 4 GUAN_048-M-20210920 GUAN evergreenForest GUAN_003-M-20… Tropical forest
## 5 GUAN_048-M-20210920 GUAN evergreenForest GUAN_004-M-20… Tropical forest
## 6 GUAN_048-M-20210920 GUAN evergreenForest GUAN_043-M-20… Tropical forest
## 7 GUAN_048-M-20210920 GUAN evergreenForest GUAN_007-M-20… Tropical forest
## 8 GUAN_042-M-20210920 GUAN evergreenForest GUAN_048-M-20… Tropical forest
## 9 GUAN_042-M-20210920 GUAN evergreenForest GUAN_042-M-20… Tropical forest
## 10 GUAN_042-M-20210920 GUAN evergreenForest GUAN_006-M-20… Tropical forest
## # ℹ 625 more rows
Join the NEON MAG, metagenome and chemistry dataframes into a single data frame. What happens to the metagenome and chemistry information on the rows with the NEON coassembly?
Chemistry data for coassembly rows is largely left blank metagenome data is all the same for coassembly rows
NEON_chemistry_7 <-NEON_chemistry %>%
full_join(NEON_metagenomes, by =c("genomicsSampleID"="Sample Name")) %>%
full_join(NEON_MAGs, by= c("genomicsSampleID"="Sample Name"))
NEON_chemistry_7
## # A tibble: 1,756 × 93
## genomicsSampleID siteID plotID nlcdClass decimalLatitude decimalLongitude
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 GUAN_048-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 2 GUAN_048-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 3 GUAN_048-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 4 GUAN_048-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 5 GUAN_048-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 6 GUAN_048-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 7 GUAN_042-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 8 GUAN_042-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 9 GUAN_042-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## 10 GUAN_042-M-20210920 GUAN GUAN_0… evergree… 18.0 -66.9
## # ℹ 1,746 more rows
## # ℹ 87 more variables: elevation <dbl>, collectionDate <date>, horizon <chr>,
## # soilTemp <dbl>, d15N <dbl>, organicd13C <dbl>, nitrogenPercent <dbl>,
## # organicCPercent <dbl>, CNratio <dbl>, soilInWaterpH <dbl>,
## # soilInCaClpH <dbl>, taxon_oid <dbl>, Domain.x <chr>,
## # `Sequencing Status` <chr>, `Study Name` <chr>, Site.x <chr>,
## # `Site ID.x` <chr>, Subplot.x <chr>, Layer.x <chr>, Date.x <chr>, …
NEON_chemistry_7 %>%
filter(str_detect(`Assembly Type`, "Combined"))
## # A tibble: 624 × 93
## genomicsSampleID siteID plotID nlcdClass decimalLatitude decimalLongitude
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 <NA> <NA> <NA> <NA> NA NA
## 2 <NA> <NA> <NA> <NA> NA NA
## 3 <NA> <NA> <NA> <NA> NA NA
## 4 <NA> <NA> <NA> <NA> NA NA
## 5 <NA> <NA> <NA> <NA> NA NA
## 6 <NA> <NA> <NA> <NA> NA NA
## 7 <NA> <NA> <NA> <NA> NA NA
## 8 <NA> <NA> <NA> <NA> NA NA
## 9 <NA> <NA> <NA> <NA> NA NA
## 10 <NA> <NA> <NA> <NA> NA NA
## # ℹ 614 more rows
## # ℹ 87 more variables: elevation <dbl>, collectionDate <date>, horizon <chr>,
## # soilTemp <dbl>, d15N <dbl>, organicd13C <dbl>, nitrogenPercent <dbl>,
## # organicCPercent <dbl>, CNratio <dbl>, soilInWaterpH <dbl>,
## # soilInCaClpH <dbl>, taxon_oid <dbl>, Domain.x <chr>,
## # `Sequencing Status` <chr>, `Study Name` <chr>, Site.x <chr>,
## # `Site ID.x` <chr>, Subplot.x <chr>, Layer.x <chr>, Date.x <chr>, …
Filter the above table to contain data for just your project taxonomic group. Make a boxplot of the soil temperatures for each sample at the sites.
NEON_chemistry_8 <-NEON_chemistry_7 %>%
filter(str_detect(`GTDB-Tk Taxonomy Lineage`,"Gammaproteobacteria"))
NEON_chemistry_8 %>%
ggplot(aes(x=fct_infreq(siteID), y=soilTemp))+geom_boxplot()+labs(title="Sample Soil Temperatures at each site", x="Site ID", y="Soil Temp (C)")+theme_classic()
## Warning: Removed 39 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
Make a scatterplot of Ecosystem Subtype vs temperature. Use the Order as the color for the points.
NEON_chemistry_8 %>%
ggplot(aes(x=fct_infreq(`Ecosystem Subtype`), y=soilTemp, color=Order))+geom_point()+labs(title="Sample Soil Temperatures vs Ecosystem Subtype", x="Ecosystem Subtype", y="Soil Temp (C)")+theme_classic()+theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))
## Warning: Removed 39 rows containing missing values or values outside the scale range
## (`geom_point()`).
Make a scatterplots of soilInCaClpH vs ncldClass (National Land Cover Database) terms. Use the Family as the color for the points.
NEON_chemistry_8 %>%
ggplot(aes(x=`nlcdClass`,y=`soilInCaClpH`, color=Family))+geom_point()+labs(title="Sample Soil Temperatures vs National Land Cover Class", x="Land Cover Class", y="Soil pH in CaCl")+theme_classic()+theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))
## Warning: Removed 39 rows containing missing values or values outside the scale range
## (`geom_point()`).
Exploring the data is an important part of data analysis. Humans are great at visually recognizing patterns. Make 3 other graphs specific to your project.
NEON_chemistry_11 <- NEON_chemistry_7 %>%
filter(str_detect(siteID,"TOOL"))
NEON_chemistry_11
## # A tibble: 149 × 93
## genomicsSampleID siteID plotID nlcdClass decimalLatitude decimalLongitude
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 2 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 3 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 4 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 5 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 6 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 7 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 8 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 9 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## 10 TOOL_041-O-20210803 TOOL TOOL_0… sedgeHer… 68.7 -149.
## # ℹ 139 more rows
## # ℹ 87 more variables: elevation <dbl>, collectionDate <date>, horizon <chr>,
## # soilTemp <dbl>, d15N <dbl>, organicd13C <dbl>, nitrogenPercent <dbl>,
## # organicCPercent <dbl>, CNratio <dbl>, soilInWaterpH <dbl>,
## # soilInCaClpH <dbl>, taxon_oid <dbl>, Domain.x <chr>,
## # `Sequencing Status` <chr>, `Study Name` <chr>, Site.x <chr>,
## # `Site ID.x` <chr>, Subplot.x <chr>, Layer.x <chr>, Date.x <chr>, …
NEON_chemistry_11 %>%
ggplot(aes(x=`Phylum`, fill=Class))+geom_bar()+coord_flip()+facet_wrap(vars(nlcdClass), scales="free_y", ncol=3)+labs(title="Toolik Field Station Phylogeny vs National Land Cover Class", x="Land Cover Class", y="n")+theme_classic()
NEON_chemistry_8 %>%
ggplot(aes(y=`soilTemp`,x=`soilInWaterpH`, color=Order))+geom_point()+labs(title="Gammaproteobacteria Sample Soil Temperature vs pH in Water", y="Soil Temperature", x="Soil pH in Water")+theme_classic()
## Warning: Removed 39 rows containing missing values or values outside the scale range
## (`geom_point()`).
NEON_chemistry_8 %>%
ggplot(aes(y=`Elevation In Meters`, x=Order, color=Order))+geom_boxplot(show.legend = FALSE)+labs(title="Gammaproteobacteria MAGs vs Elevation", x="Order", y="Elevation (m)")+theme_classic()+theme(axis.text.x=element_text(angle=45, vjust=1, hjust=1))
library(tidyverse)
library(zoo)
library(knitr)
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_edArchaea.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus"), "; ", remove = FALSE) %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 6 pieces. Additional pieces discarded in 46 rows [3, 4, 24, 25, 26,
## 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 54, 232, 267, ...].
## Warning: Expected 6 pieces. Missing pieces filled with `NA` in 446 rows [1, 2, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 46, 50, 53, ...].
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [4, 7, 8, 236,
## 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
## ...].
NEON_MAGs_ind <- NEON_MAGs %>%
filter(`Assembly Type` == "Individual")
NEON_MAGs_co <- NEON_MAGs %>%
filter(`Assembly Type` == "Combined")
# Select the GTDB Taxonomic lineage and separate into taxonomic levels
sankey_data <- NEON_MAGs_ind %>%
select(`GTDB-Tk Taxonomy Lineage`) %>%
# NAs are likely Archaea
replace_na(list(`GTDB-Tk Taxonomy Lineage` = 'Archaea')) %>%
# Pavian format requires p__ etc
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), "; ")
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 1093 rows [1, 2, 4, 5, 6,
## 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 36, ...].
# Fill in the NAs with the taxonomic the higher taxonomic level to the left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))
# Put the data into a format that can be read by the Sankey App
sankey_data <- sankey_data %>%
unite(col = "classification", c(Domain, Phylum, Class, Order, Family, Genus, Species), sep='; ') %>%
mutate_at("classification", str_replace, "Archaea", "d__Archaea") %>%
mutate_at("classification", str_replace, "Bacteria", "d__Bacteria") %>%
mutate_at("classification", str_replace, "; ", "|p__") %>%
mutate_at("classification", str_replace, "; ", "|c__") %>%
mutate_at("classification", str_replace, "; ", "|o__") %>%
mutate_at("classification", str_replace, "; ", "|f__") %>%
mutate_at("classification", str_replace, "; ", "|g__") %>%
mutate_at("classification", str_replace, "; ", "|s__")
# Create format for Pavian with counts for each taxonomic level
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data
sankey_data_f <- sankey_data
sankey_data_o <- sankey_data
sankey_data_c <- sankey_data
sankey_data_p <- sankey_data
sankey_data_d <- sankey_data
sankey_data_g$classification <- sub("\\|s__.*", "", sankey_data_g$classification)
sankey_data_f$classification <- sub("\\|g__.*", "", sankey_data_f$classification)
sankey_data_o$classification <- sub("\\|f__.*", "", sankey_data_o$classification)
sankey_data_c$classification <- sub("\\|o__.*", "", sankey_data_c$classification)
sankey_data_p$classification <- sub("\\|c__.*", "", sankey_data_p$classification)
sankey_data_d$classification <- sub("\\|p__.*", "", sankey_data_d$classification)
sankey_data_allTaxa <- bind_rows(sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o, sankey_data_c, sankey_data_p, sankey_data_d) %>%
mutate(classification = as.factor(classification)) %>%
count(classification) %>%
# rename for Pavian format
rename(`#SampleID` = `classification`) %>%
rename(`Metaphlan2_Analysis` = `n`)
# Write file to input to Pavian Sankey
write_tsv(sankey_data_allTaxa, "NEON_MAG_ind_pavian.txt")
knitr::include_url("images/lab14/sankey-NEON_MAG_ind_pavian.txt.html")

knitr::include_url("images/lab14/sankey-NEON_MAG_co_pavian.txt.html")
NEON_MAGs_ind_GPro <- NEON_MAGs %>%
filter(`Assembly Type` == "Individual") %>%
filter(str_detect(`Class`,"Gammaproteobacteria"))
# Select the GTDB Taxonomic lineage and separate into taxonomic levels
sankey_data <- NEON_MAGs_ind_GPro %>%
select(`GTDB-Tk Taxonomy Lineage`) %>%
# NAs are likely Archaea
replace_na(list(`GTDB-Tk Taxonomy Lineage` = 'Archaea')) %>%
# Pavian format requires p__ etc
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), "; ")
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 83 rows [1, 2, 3, 4, 5,
## 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Fill in the NAs with the taxonomic the higher taxonomic level to the left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))
# Put the data into a format that can be read by the Sankey App
sankey_data <- sankey_data %>%
unite(col = "classification", c(Domain, Phylum, Class, Order, Family, Genus, Species), sep='; ') %>%
mutate_at("classification", str_replace, "Archaea", "d__Archaea") %>%
mutate_at("classification", str_replace, "Bacteria", "d__Bacteria") %>%
mutate_at("classification", str_replace, "; ", "|p__") %>%
mutate_at("classification", str_replace, "; ", "|c__") %>%
mutate_at("classification", str_replace, "; ", "|o__") %>%
mutate_at("classification", str_replace, "; ", "|f__") %>%
mutate_at("classification", str_replace, "; ", "|g__") %>%
mutate_at("classification", str_replace, "; ", "|s__")
# Create format for Pavian with counts for each taxonomic level
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data
sankey_data_f <- sankey_data
sankey_data_o <- sankey_data
sankey_data_c <- sankey_data
sankey_data_p <- sankey_data
sankey_data_d <- sankey_data
sankey_data_g$classification <- sub("\\|s__.*", "", sankey_data_g$classification)
sankey_data_f$classification <- sub("\\|g__.*", "", sankey_data_f$classification)
sankey_data_o$classification <- sub("\\|f__.*", "", sankey_data_o$classification)
sankey_data_c$classification <- sub("\\|o__.*", "", sankey_data_c$classification)
sankey_data_p$classification <- sub("\\|c__.*", "", sankey_data_p$classification)
sankey_data_d$classification <- sub("\\|p__.*", "", sankey_data_d$classification)
sankey_data_allTaxa <- bind_rows(sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o, sankey_data_c, sankey_data_p, sankey_data_d) %>%
mutate(classification = as.factor(classification)) %>%
count(classification) %>%
# rename for Pavian format
rename(`#SampleID` = `classification`) %>%
rename(`Metaphlan2_Analysis` = `n`)
# Write file to input to Pavian Sankey
write_tsv(sankey_data_allTaxa, "NEON_MAG_ind_Gpro.txt")
knitr::include_url("images/lab14/sankey-NEON_MAG_ind_Gpro.txt.html")
NEON_MAGs_co_GPro <- NEON_MAGs %>%
filter(`Assembly Type` == "Combined") %>%
filter(str_detect(`Class`,"Gammaproteobacteria"))
# Select the GTDB Taxonomic lineage and separate into taxonomic levels
sankey_data <- NEON_MAGs_co_GPro %>%
select(`GTDB-Tk Taxonomy Lineage`) %>%
# NAs are likely Archaea
replace_na(list(`GTDB-Tk Taxonomy Lineage` = 'Archaea')) %>%
# Pavian format requires p__ etc
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), "; ")
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 39 rows [1, 2, 3, 4, 5,
## 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Fill in the NAs with the taxonomic the higher taxonomic level to the left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))
# Put the data into a format that can be read by the Sankey App
sankey_data <- sankey_data %>%
unite(col = "classification", c(Domain, Phylum, Class, Order, Family, Genus, Species), sep='; ') %>%
mutate_at("classification", str_replace, "Archaea", "d__Archaea") %>%
mutate_at("classification", str_replace, "Bacteria", "d__Bacteria") %>%
mutate_at("classification", str_replace, "; ", "|p__") %>%
mutate_at("classification", str_replace, "; ", "|c__") %>%
mutate_at("classification", str_replace, "; ", "|o__") %>%
mutate_at("classification", str_replace, "; ", "|f__") %>%
mutate_at("classification", str_replace, "; ", "|g__") %>%
mutate_at("classification", str_replace, "; ", "|s__")
# Create format for Pavian with counts for each taxonomic level
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data
sankey_data_f <- sankey_data
sankey_data_o <- sankey_data
sankey_data_c <- sankey_data
sankey_data_p <- sankey_data
sankey_data_d <- sankey_data
sankey_data_g$classification <- sub("\\|s__.*", "", sankey_data_g$classification)
sankey_data_f$classification <- sub("\\|g__.*", "", sankey_data_f$classification)
sankey_data_o$classification <- sub("\\|f__.*", "", sankey_data_o$classification)
sankey_data_c$classification <- sub("\\|o__.*", "", sankey_data_c$classification)
sankey_data_p$classification <- sub("\\|c__.*", "", sankey_data_p$classification)
sankey_data_d$classification <- sub("\\|p__.*", "", sankey_data_d$classification)
sankey_data_allTaxa <- bind_rows(sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o, sankey_data_c, sankey_data_p, sankey_data_d) %>%
mutate(classification = as.factor(classification)) %>%
count(classification) %>%
# rename for Pavian format
rename(`#SampleID` = `classification`) %>%
rename(`Metaphlan2_Analysis` = `n`)
# Write file to input to Pavian Sankey
write_tsv(sankey_data_allTaxa, "NEON_MAG_co_Gpro.txt")
knitr::include_url("images/lab14/sankey-NEON_MAG_co_Gpro.txt.html")
NEON_MAGs_Toolik <- NEON_MAGs %>%
filter(str_detect(`Site ID`,"TOOL"))
# Select the GTDB Taxonomic lineage and separate into taxonomic levels
sankey_data <- NEON_MAGs_Toolik %>%
select(`GTDB-Tk Taxonomy Lineage`) %>%
# NAs are likely Archaea
replace_na(list(`GTDB-Tk Taxonomy Lineage` = 'Archaea')) %>%
# Pavian format requires p__ etc
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), "; ")
## Warning: Expected 7 pieces. Missing pieces filled with `NA` in 147 rows [1, 2, 3, 4, 5,
## 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Fill in the NAs with the taxonomic the higher taxonomic level to the left
sankey_data[] <- t(apply(sankey_data, 1, zoo::na.locf))
# Put the data into a format that can be read by the Sankey App
sankey_data <- sankey_data %>%
unite(col = "classification", c(Domain, Phylum, Class, Order, Family, Genus, Species), sep='; ') %>%
mutate_at("classification", str_replace, "Archaea", "d__Archaea") %>%
mutate_at("classification", str_replace, "Bacteria", "d__Bacteria") %>%
mutate_at("classification", str_replace, "; ", "|p__") %>%
mutate_at("classification", str_replace, "; ", "|c__") %>%
mutate_at("classification", str_replace, "; ", "|o__") %>%
mutate_at("classification", str_replace, "; ", "|f__") %>%
mutate_at("classification", str_replace, "; ", "|g__") %>%
mutate_at("classification", str_replace, "; ", "|s__")
# Create format for Pavian with counts for each taxonomic level
sankey_data_s <- sankey_data
sankey_data_g <- sankey_data
sankey_data_f <- sankey_data
sankey_data_o <- sankey_data
sankey_data_c <- sankey_data
sankey_data_p <- sankey_data
sankey_data_d <- sankey_data
sankey_data_g$classification <- sub("\\|s__.*", "", sankey_data_g$classification)
sankey_data_f$classification <- sub("\\|g__.*", "", sankey_data_f$classification)
sankey_data_o$classification <- sub("\\|f__.*", "", sankey_data_o$classification)
sankey_data_c$classification <- sub("\\|o__.*", "", sankey_data_c$classification)
sankey_data_p$classification <- sub("\\|c__.*", "", sankey_data_p$classification)
sankey_data_d$classification <- sub("\\|p__.*", "", sankey_data_d$classification)
sankey_data_allTaxa <- bind_rows(sankey_data_s, sankey_data_g, sankey_data_f, sankey_data_o, sankey_data_c, sankey_data_p, sankey_data_d) %>%
mutate(classification = as.factor(classification)) %>%
count(classification) %>%
# rename for Pavian format
rename(`#SampleID` = `classification`) %>%
rename(`Metaphlan2_Analysis` = `n`)
# Write file to input to Pavian Sankey
write_tsv(sankey_data_allTaxa, "NEON_MAG_Toolik.txt")
knitr::include_url("images/lab14/sankey-NEON_MAG_Toolik.txt.html")
library(tidyverse)
library(knitr)
library(ggtree)
library(ggimage)
library(rphylopic)
library(treeio)
library(tidytree)
library(ape)
library(TreeTools)
library(phytools)
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Rows: 1754 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Bin ID, Genome Name, Bin Quality, Bin Lineage, GTDB-Tk Taxonomy L...
## dbl (10): IMG Genome ID, Bin Completeness, Bin Contamination, Total Number ...
## date (1): Date Added
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 624 rows [1131, 1132,
## 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145,
## 1146, 1147, 1148, 1149, 1150, ...].
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T))
## Rows: 176 Columns: 46
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (18): Domain, Sequencing Status, Study Name, Genome Name / Sample Name, ...
## dbl (16): taxon_oid, IMG Genome ID, Depth In Meters, Elevation In Meters, Ge...
## lgl (12): Altitude In Meters, Chlorophyll Concentration, Longhurst Code, Lon...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 1 rows [53].
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
# remove -COMP from genomicsSampleID
mutate_at("genomicsSampleID", str_replace, "-COMP", "")
## Rows: 87 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): genomicsSampleID, siteID, plotID, nlcdClass, horizon
## dbl (11): decimalLatitude, decimalLongitude, elevation, soilTemp, d15N, org...
## date (1): collectionDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
left_join(NEON_metagenomes, by = "Sample Name") %>%
left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID"))
tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")
# Make a vector with the internal node lables
node_vector_bac = c(tree_bac$tip.label,tree_bac$node.label)
# Search for your Phylum or Class
grep("Gammaproteobacteria", node_vector_bac, value = TRUE)
## [1] "'1.0:c__Gammaproteobacteria'"
match(grep("Gammaproteobacteria", node_vector_bac, value = TRUE), node_vector_bac)
## [1] 3048
tree_bac_node_Gammaproteobacteria <- Preorder(tree_bac)
tree_Gammaproteobacteria <- Subtree(tree_bac_node_Gammaproteobacteria, 3048)
ggtree(tree_Gammaproteobacteria) +
geom_tiplab(size=3) +
xlim(0,20)
## Gammaproteobacteria Circular Tree
ggtree(tree_Gammaproteobacteria, layout="circular") +
geom_tiplab(aes(angle=angle))+
theme_tree() +
xlim(0,20)
library(tidyverse)
library(knitr)
library(ggtree)
library(TDbook) #A Companion Package for the Book "Data Integration, Manipulation and Visualization of Phylogenetic Trees" by Guangchuang Yu (2022, ISBN:9781032233574).
library(ggimage)
library(rphylopic)
library(treeio)
library(tidytree)
library(ape)
library(TreeTools)
library(phytools)
library(ggnewscale)
library(ggtreeExtra)
library(ggstar)
#library(ggeasy) #https://jonocarroll.github.io/ggeasy/
#library(DT)
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T))
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
# remove -COMP from genomicsSampleID
mutate_at("genomicsSampleID", str_replace, "-COMP", "")
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
left_join(NEON_metagenomes, by = "Sample Name") %>%
left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
rename("label" = "Bin ID")
tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")
#Gammaproteobacteria
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria <- NEON_MAGs_metagenomes_chemistry %>%
filter(str_detect(`Class`,"Gammaproteobacteria"))
#Novel
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria_Novel <- NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
filter(is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species))
#almost are novel only two have species names
#Toolik
NEON_MAGs_metagenomes_chemistry_TOOL<- NEON_MAGs_metagenomes_chemistry %>%
filter(`Site ID.x` == "TOOL") %>%
filter(Domain == "Bacteria")
TOOL_MAGs_label <- NEON_MAGs_metagenomes_chemistry_TOOL$label
tree_bac_TOOL_MAGs <-drop.tip(tree_bac,tree_bac$tip.label[-match(TOOL_MAGs_label, tree_bac$tip.label)])
# Make a vector with the internal node lables
node_vector_bac = c(tree_bac$tip.label,tree_bac$node.label)
# Search for your Phylum or Class
grep("Gammaproteobacteria", node_vector_bac, value = TRUE)
## [1] "'1.0:c__Gammaproteobacteria'"
match(grep("Gammaproteobacteria", node_vector_bac, value = TRUE), node_vector_bac)
## [1] 3048
grep("Alphaproteobacteria", node_vector_bac, value = TRUE)
## [1] "'1.0:c__Alphaproteobacteria'"
match(grep("Alphaproteobacteria", node_vector_bac, value = TRUE), node_vector_bac)
## [1] 3171
grep("Pseudomonadota", node_vector_bac, value = TRUE)
## [1] "'1.0:p__Pseudomonadota'"
match(grep("Pseudomonadota", node_vector_bac, value = TRUE), node_vector_bac)
## [1] 3047
tree_bac_node_Gammaproteobacteria <- Preorder(tree_bac)
tree_Gammaproteobacteria <- Subtree(tree_bac_node_Gammaproteobacteria, 3048)
# Make a vector with the internal node lables
node_vector_bac_TOOL_MAGS = c(tree_bac_TOOL_MAGs$tip.label,tree_bac_TOOL_MAGs$node.label)
grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE)
## [1] "'1.0:c__Gammaproteobacteria'"
match(grep("Gammaproteobacteria", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
## [1] 259
grep("Pseudomonadota", node_vector_bac_TOOL_MAGS, value = TRUE)
## [1] "'1.0:p__Pseudomonadota'"
match(grep("Pseudomonadota", node_vector_bac_TOOL_MAGS, value = TRUE), node_vector_bac_TOOL_MAGS)
## [1] 258
ggtree(tree_Gammaproteobacteria) +
geom_tiplab(size=3) +
xlim(0,20)
library(tidyverse)
library(knitr)
library(ggtree)
library(TDbook) #A Companion Package for the Book "Data Integration, Manipulation and Visualization of Phylogenetic Trees" by Guangchuang Yu (2022, ISBN:9781032233574).
library(ggimage)
library(rphylopic)
library(treeio)
library(tidytree)
library(ape)
library(TreeTools)
library(phytools)
library(ggnewscale)
library(ggtreeExtra)
library(ggstar)
library(ggplot2)
# library(ggeasy) #https://jonocarroll.github.io/ggeasy/
library(DT)
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T))
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
# remove -COMP from genomicsSampleID
mutate_at("genomicsSampleID", str_replace, "-COMP", "")
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
left_join(NEON_metagenomes, by = "Sample Name") %>%
left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
rename("label" = "Bin ID")
tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")
NEON_MAGs_metagenomes_chemistry_Ind<- NEON_MAGs_metagenomes_chemistry %>%
filter(`Assembly Type`=="Individual")
#Gammaproteobacteria
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria <- NEON_MAGs_metagenomes_chemistry %>%
filter(str_detect(`Class`,"Gammaproteobacteria"))
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria_Ind <- NEON_MAGs_metagenomes_chemistry_Ind %>%
filter(str_detect(`Class`,"Gammaproteobacteria"))
#Novel
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria_Novel <- NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
filter(is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species))
#almost are novel only two have species names
#Toolik
NEON_MAGs_metagenomes_chemistry_TOOL<- NEON_MAGs_metagenomes_chemistry %>%
filter(`Site ID.x` == "TOOL")
NEON_MAGs_metagenomes_chemistry_TOOL_Ind<- NEON_MAGs_metagenomes_chemistry_Ind %>%
filter(`Site ID.x` == "TOOL")
TOOL_MAGs_label <- NEON_MAGs_metagenomes_chemistry_TOOL$label
tree_bac_TOOL_MAGs <-drop.tip(tree_bac,tree_bac$tip.label[-match(TOOL_MAGs_label, tree_bac$tip.label)])
#finding notable gammaproteobacteria
datatable(NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
count(Order, sort=TRUE))
#steroidobacterales, Burkholderiales, xanthomodales
datatable(NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
count(Genus, sort=TRUE))
#caballeronia, rhodanobacter, herbspirillum, paraburkholderia
datatable(NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
count(Species, sort=TRUE))
#Novel bacteria in the Class Gammaproteobacteria
datatable (NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria_Novel %>%
count(Order, sort=TRUE))
library(tidyverse)
library(knitr)
library(ggtree)
library(TDbook) #A Companion Package for the Book "Data Integration, Manipulation and Visualization of Phylogenetic Trees" by Guangchuang Yu (2022, ISBN:9781032233574).
library(ggimage)
library(rphylopic)
library(treeio)
library(tidytree)
library(ape)
library(TreeTools)
library(phytools)
library(ggnewscale)
library(ggtreeExtra)
library(ggstar)
#library(ggeasy) #https://jonocarroll.github.io/ggeasy/
library(DT)
NEON_MAGs <- read_csv("data/NEON/GOLD_Study_ID_Gs0161344_NEON_2024_4_21.csv") %>%
# remove columns that are not needed for data analysis
select(-c(`GOLD Study ID`, `Bin Methods`, `Created By`, `Date Added`, `Bin Lineage`)) %>%
# create a new column with the Assembly Type
mutate("Assembly Type" = case_when(`Genome Name` == "NEON combined assembly" ~ `Genome Name`,
TRUE ~ "Individual")) %>%
mutate_at("Assembly Type", str_replace, "NEON combined assembly", "Combined") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "d__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "p__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "c__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "o__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "f__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "g__", "") %>%
mutate_at("GTDB-Tk Taxonomy Lineage", str_replace, "s__", "") %>%
separate(`GTDB-Tk Taxonomy Lineage`, c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), ";", remove = FALSE) %>%
mutate_at("Domain", na_if,"") %>%
mutate_at("Phylum", na_if,"") %>%
mutate_at("Class", na_if,"") %>%
mutate_at("Order", na_if,"") %>%
mutate_at("Family", na_if,"") %>%
mutate_at("Genus", na_if,"") %>%
mutate_at("Species", na_if,"") %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "S-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
NEON_metagenomes <- read_tsv("data/NEON/exported_img_data_Gs0161344_NEON.tsv") %>%
select(-c(`Domain`, `Sequencing Status`, `Sequencing Center`)) %>%
rename(`Genome Name` = `Genome Name / Sample Name`) %>%
filter(str_detect(`Genome Name`, 're-annotation', negate = T)) %>%
filter(str_detect(`Genome Name`, 'WREF plot', negate = T))
NEON_metagenomes <- NEON_metagenomes %>%
# Get rid of the the common string "Soil microbial communities from "
mutate_at("Genome Name", str_replace, "Terrestrial soil microbial communities from ", "") %>%
# Use the first `-` to split the column in two
separate(`Genome Name`, c("Site","Sample Name"), " - ") %>%
# Get rid of the the common string "-comp-1"
mutate_at("Sample Name", str_replace, "-comp-1", "") %>%
# separate the Sample Name into Site ID and plot info
separate(`Sample Name`, c("Site ID","subplot.layer.date"), "_", remove = FALSE,) %>%
# separate the plot info into 3 columns
separate(`subplot.layer.date`, c("Subplot", "Layer", "Date"), "-")
NEON_chemistry <- read_tsv("data/NEON/neon_plot_soilChem1_metadata.tsv") %>%
# remove -COMP from genomicsSampleID
mutate_at("genomicsSampleID", str_replace, "-COMP", "")
NEON_MAGs_metagenomes_chemistry <- NEON_MAGs %>%
left_join(NEON_metagenomes, by = "Sample Name") %>%
left_join(NEON_chemistry, by = c("Sample Name" = "genomicsSampleID")) %>%
rename("label" = "Bin ID")
tree_arc <- read.tree("data/NEON/gtdbtk.ar53.decorated.tree")
tree_bac <- read.tree("data/NEON/gtdbtk.bac120.decorated.tree")
#Gammaproteobacteria
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria <- NEON_MAGs_metagenomes_chemistry %>%
filter(str_detect(`Class`,"Gammaproteobacteria"))
#Novel
NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria_Novel <- NEON_MAGs_metagenomes_chemistry_Gammaproteobacteria %>%
filter(is.na(Order) | is.na(Family) | is.na(Genus) | is.na(Species))
#almost are novel only two have species names
#finding notable gammaproteobacteria
datatable(NEON_MAGs_metagenomes_chemistry_TOOL %>%
count(Order, sort=TRUE))
#steroidobacterales, Burkholderiales, xanthomodales
datatable(NEON_MAGs_metagenomes_chemistry_TOOL %>%
count(Genus, sort=TRUE))
#caballeronia, rhodanobacter, herbspirillum, paraburkholderia
datatable(NEON_MAGs_metagenomes_chemistry_TOOL %>%
count(Species, sort=TRUE))
#Novel bacteria in the Class Gammaproteobacteria
datatable (NEON_MAGs_metagenomes_chemistry_TOOL_Novel %>%
count(Order, sort=TRUE))
Yu G (2022). Data Integration, Manipulation and Visualization of Phylogenetic Treess, 1st edition edition. Chapman and Hall/CRC. doi:10.1201/9781003279242, https://www.amazon.com/Integration-Manipulation-Visualization-Phylogenetic-Computational-ebook/dp/B0B5NLZR1Z/.
Xu S, Li L, Luo X, Chen M, Tang W, Zhan L, Dai Z, Tommy T. Lam, Guan Y, Yu G (2022). “Ggtree: A serialized data object for visualization of a phylogenetic tree and annotation data.” iMeta, 1(4), e56. doi:10.1002/imt2.56, https://onlinelibrary.wiley.com/doi/full/10.1002/imt2.56.
Yu G (2020). “Using ggtree to Visualize Data on Tree-Like Structures.” Current Protocols in Bioinformatics, 69(1), e96. doi:10.1002/cpbi.96, https://currentprotocols.onlinelibrary.wiley.com/doi/abs/10.1002/cpbi.96.
Yu G, Lam TT, Zhu H, Guan Y (2018). “Two methods for mapping and visualizing associated data on phylogeny using ggtree.” Molecular Biology and Evolution, 35, 3041-3043. doi:10.1093/molbev/msy194, https://academic.oup.com/mbe/article/35/12/3041/5142656.
Yu G, Smith D, Zhu H, Guan Y, Lam TT (2017). “ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data.” Methods in Ecology and Evolution, 8, 28-36. doi:10.1111/2041-210X.12628, http://onlinelibrary.wiley.com/doi/10.1111/2041-210X.12628/abstract.